From e1bf0b866f811bed095e30df78a3149fd92413f7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 7 Mar 2026 06:46:46 -0800
Subject: [PATCH 001/210] Update the macos workflow

---
 .github/workflows/build-everything-tgw.yml   | 2 +-
 .github/workflows/build-portable-release.yml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 78fd2d6b..9322f859 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -67,4 +67,4 @@ jobs:
     uses: ./.github/workflows/build-portable-release.yml
     with:
       version: ${{ inputs.version }}
-      config: 'os:macos-13,macos-14'
+      config: 'os:macos-15-intel,macos-14'
diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml
index a6aec751..cc584fb2 100644
--- a/.github/workflows/build-portable-release.yml
+++ b/.github/workflows/build-portable-release.yml
@@ -120,7 +120,7 @@ jobs:
                 PACKAGES_PATH="portable_env/Lib/site-packages"
                 rm start_linux.sh start_macos.sh
             elif [[ "$RUNNER_OS" == "macOS" ]]; then
-                if [[ "$OS_TYPE" == "macos-13" ]]; then
+                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
                     PLATFORM="macos-x86_64"
                     PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only.tar.gz"
                     REQ_TYPE="apple_intel"
@@ -153,7 +153,7 @@ jobs:
 
             # Select requirements file based on platform
             if [[ "$RUNNER_OS" == "macOS" ]]; then
-                if [[ "$OS_TYPE" == "macos-13" ]]; then
+                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
                     REQ_FILE="requirements/portable/requirements_apple_intel.txt"
                 else
                     REQ_FILE="requirements/portable/requirements_apple_silicon.txt"

From 0cecc0a041353be0d81f8ed969f03cca396351e1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 7 Mar 2026 06:59:48 -0800
Subject: [PATCH 002/210] Use tar.gz for Linux/macOS portable builds to
 preserve symlinks

---
 .github/workflows/build-portable-release-cuda.yml | 15 ++++++++-------
 .github/workflows/build-portable-release-rocm.yml | 15 ++++++++-------
 .../workflows/build-portable-release-vulkan.yml   | 15 ++++++++-------
 .github/workflows/build-portable-release.yml      | 15 ++++++++-------
 4 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml
index 87ab7f9e..a5759112 100644
--- a/.github/workflows/build-portable-release-cuda.yml
+++ b/.github/workflows/build-portable-release-cuda.yml
@@ -150,15 +150,16 @@ jobs:
             # 5. Clean up
             rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
 
-            # 6. Create ZIP file
+            # 6. Create archive
             cd ..
-            ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
-            echo "Creating archive: $ZIP_NAME"
-
             if [[ "$RUNNER_OS" == "Windows" ]]; then
-                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ZIP_NAME"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
             else
-                zip -r "$ZIP_NAME" "text-generation-webui-${VERSION_CLEAN}"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
             fi
 
       - name: Upload files to a GitHub release
@@ -167,7 +168,7 @@ jobs:
         continue-on-error: true
         with:
           repo_token: ${{ secrets.GITHUB_TOKEN }}
-          file: ../textgen-portable-*.zip
+          file: ../textgen-portable-*
           tag: ${{ inputs.version }}
           file_glob: true
           make_latest: false
diff --git a/.github/workflows/build-portable-release-rocm.yml b/.github/workflows/build-portable-release-rocm.yml
index 5b43b2d3..6f9ea4ec 100644
--- a/.github/workflows/build-portable-release-rocm.yml
+++ b/.github/workflows/build-portable-release-rocm.yml
@@ -145,15 +145,16 @@ jobs:
             # 5. Clean up
             rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
 
-            # 6. Create ZIP file
+            # 6. Create archive
             cd ..
-            ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm.zip"
-            echo "Creating archive: $ZIP_NAME"
-
             if [[ "$RUNNER_OS" == "Windows" ]]; then
-                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ZIP_NAME"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
             else
-                zip -r "$ZIP_NAME" "text-generation-webui-${VERSION_CLEAN}"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
             fi
 
       - name: Upload files to a GitHub release
@@ -162,7 +163,7 @@ jobs:
         continue-on-error: true
         with:
           repo_token: ${{ secrets.GITHUB_TOKEN }}
-          file: ../textgen-portable-*.zip
+          file: ../textgen-portable-*
           tag: ${{ inputs.version }}
           file_glob: true
           make_latest: false
diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml
index e8b75b5b..b98b2e5e 100644
--- a/.github/workflows/build-portable-release-vulkan.yml
+++ b/.github/workflows/build-portable-release-vulkan.yml
@@ -145,15 +145,16 @@ jobs:
             # 5. Clean up
             rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
 
-            # 6. Create ZIP file
+            # 6. Create archive
             cd ..
-            ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan.zip"
-            echo "Creating archive: $ZIP_NAME"
-
             if [[ "$RUNNER_OS" == "Windows" ]]; then
-                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ZIP_NAME"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
             else
-                zip -r "$ZIP_NAME" "text-generation-webui-${VERSION_CLEAN}"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
             fi
 
       - name: Upload files to a GitHub release
@@ -162,7 +163,7 @@ jobs:
         continue-on-error: true
         with:
           repo_token: ${{ secrets.GITHUB_TOKEN }}
-          file: ../textgen-portable-*.zip
+          file: ../textgen-portable-*
           tag: ${{ inputs.version }}
           file_glob: true
           make_latest: false
diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml
index cc584fb2..1bd4e163 100644
--- a/.github/workflows/build-portable-release.yml
+++ b/.github/workflows/build-portable-release.yml
@@ -171,15 +171,16 @@ jobs:
             # 5. Clean up
             rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
 
-            # 6. Create ZIP file
+            # 6. Create archive
             cd ..
-            ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip"
-            echo "Creating archive: $ZIP_NAME"
-
             if [[ "$RUNNER_OS" == "Windows" ]]; then
-                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ZIP_NAME"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
             else
-                zip -r "$ZIP_NAME" "text-generation-webui-${VERSION_CLEAN}"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
             fi
 
       - name: Upload files to a GitHub release
@@ -188,7 +189,7 @@ jobs:
         continue-on-error: true
         with:
           repo_token: ${{ secrets.GITHUB_TOKEN }}
-          file: ../textgen-portable-*.zip
+          file: ../textgen-portable-*
           tag: ${{ inputs.version }}
           file_glob: true
           make_latest: false

From 6ff111d18e2f56f861fb6be54166b772f248fa33 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 7 Mar 2026 22:05:31 -0300
Subject: [PATCH 003/210] ExLlamav3: handle exceptions in ConcurrentGenerator
 iterate loop

---
 modules/exllamav3.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index b4b76e21..d9772682 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -53,7 +53,16 @@ class ConcurrentGenerator:
                 if not self.job_queues:
                     self.has_jobs.clear()
                     continue
-                results = self.generator.iterate()
+                try:
+                    results = self.generator.iterate()
+                except Exception:
+                    logger.error("Exception in ConcurrentGenerator iterate loop:\n" + traceback.format_exc())
+                    for q in self.job_queues.values():
+                        q.put(None)
+                    self.job_queues.clear()
+                    self.generator.clear_queue()
+                    self.has_jobs.clear()
+                    continue
             for result in results:
                 job = result["job"]
                 q = self.job_queues.get(job)

From baf4e13ff147246f00c3947f14a701deac88f4ea Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 7 Mar 2026 22:34:48 -0300
Subject: [PATCH 004/210] ExLlamav3: fix draft cache size to match main cache

---
 modules/exllamav3.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index d9772682..9ea38432 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -175,23 +175,8 @@ class Exllamav3Model:
                 logger.warning(f"Draft model not found at {draft_path}, speculative decoding disabled.")
             else:
                 draft_config = Config.from_directory(str(draft_path))
-
-                # Set context size for draft model with 256-multiple validation
-                if shared.args.ctx_size_draft > 0:
-                    draft_max_tokens = shared.args.ctx_size_draft
-                else:
-                    draft_max_tokens = shared.args.ctx_size
-
-                # Validate draft model context size is a multiple of 256
-                if draft_max_tokens % 256 != 0:
-                    adjusted_draft_tokens = ((draft_max_tokens // 256) + 1) * 256
-                    logger.warning(f"Draft model max_num_tokens must be a multiple of 256. Adjusting from {draft_max_tokens} to {adjusted_draft_tokens}")
-                    draft_max_tokens = adjusted_draft_tokens
-
-                draft_config.max_seq_len = draft_max_tokens
-
                 draft_model = Model.from_config(draft_config)
-                draft_cache = Cache(draft_model, max_num_tokens=draft_max_tokens, layer_type=layer_type, **cache_kwargs)
+                draft_cache = Cache(draft_model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs)
 
                 draft_load_params = {'progressbar': True}
                 if split:

From 0132966d09abc8a3fc3ffba3db2a0069e430fd47 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 7 Mar 2026 23:06:15 -0300
Subject: [PATCH 005/210] Add PyPI fallback for PyTorch install commands

---
 one_click.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/one_click.py b/one_click.py
index efb07134..5131206e 100644
--- a/one_click.py
+++ b/one_click.py
@@ -111,13 +111,14 @@ def get_gpu_choice():
 def get_pytorch_install_command(gpu_choice):
     """Get PyTorch installation command based on GPU choice"""
     base_cmd = f"python -m pip install torch=={TORCH_VERSION} "
+    pypi_fallback = " --extra-index-url https://pypi.org/simple/"
 
     if gpu_choice == "NVIDIA_CUDA128":
-        return base_cmd + "--index-url https://download.pytorch.org/whl/cu128"
+        return base_cmd + "--index-url https://download.pytorch.org/whl/cu128" + pypi_fallback
     elif gpu_choice == "AMD":
-        return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.4"
+        return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.4" + pypi_fallback
     elif gpu_choice in ["APPLE", "NONE"]:
-        return base_cmd + "--index-url https://download.pytorch.org/whl/cpu"
+        return base_cmd + "--index-url https://download.pytorch.org/whl/cpu" + pypi_fallback
     elif gpu_choice == "INTEL":
         if is_linux():
             return "python -m pip install torch==2.1.0a0 intel-extension-for-pytorch==2.1.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
@@ -130,16 +131,17 @@ def get_pytorch_install_command(gpu_choice):
 def get_pytorch_update_command(gpu_choice):
     """Get PyTorch update command based on GPU choice"""
     base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} "
+    pypi_fallback = " --extra-index-url https://pypi.org/simple/"
 
     if gpu_choice == "NVIDIA_CUDA128":
-        return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu128"
+        return f"{base_cmd}--index-url https://download.pytorch.org/whl/cu128" + pypi_fallback
     elif gpu_choice == "AMD":
-        return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.4"
+        return f"{base_cmd}--index-url https://download.pytorch.org/whl/rocm6.4" + pypi_fallback
     elif gpu_choice in ["APPLE", "NONE"]:
-        return f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
+        return f"{base_cmd}--index-url https://download.pytorch.org/whl/cpu" + pypi_fallback
     elif gpu_choice == "INTEL":
         intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
-        return f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+        return f"{base_cmd}{intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
     else:
         return base_cmd
 

From b3705d87bfceda3ad6be09473e0772b67e897dd2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 7 Mar 2026 23:06:15 -0300
Subject: [PATCH 006/210] Add PyPI fallback for PyTorch install commands

---
 one_click.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/one_click.py b/one_click.py
index efb07134..5131206e 100644
--- a/one_click.py
+++ b/one_click.py
@@ -111,13 +111,14 @@ def get_gpu_choice():
 def get_pytorch_install_command(gpu_choice):
     """Get PyTorch installation command based on GPU choice"""
     base_cmd = f"python -m pip install torch=={TORCH_VERSION} "
+    pypi_fallback = " --extra-index-url https://pypi.org/simple/"
 
     if gpu_choice == "NVIDIA_CUDA128":
-        return base_cmd + "--index-url https://download.pytorch.org/whl/cu128"
+        return base_cmd + "--index-url https://download.pytorch.org/whl/cu128" + pypi_fallback
     elif gpu_choice == "AMD":
-        return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.4"
+        return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.4" + pypi_fallback
     elif gpu_choice in ["APPLE", "NONE"]:
-        return base_cmd + "--index-url https://download.pytorch.org/whl/cpu"
+        return base_cmd + "--index-url https://download.pytorch.org/whl/cpu" + pypi_fallback
     elif gpu_choice == "INTEL":
         if is_linux():
             return "python -m pip install torch==2.1.0a0 intel-extension-for-pytorch==2.1.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
@@ -130,16 +131,17 @@ def get_pytorch_install_command(gpu_choice):
 def get_pytorch_update_command(gpu_choice):
     """Get PyTorch update command based on GPU choice"""
     base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} "
+    pypi_fallback = " --extra-index-url https://pypi.org/simple/"
 
     if gpu_choice == "NVIDIA_CUDA128":
-        return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu128"
+        return f"{base_cmd}--index-url https://download.pytorch.org/whl/cu128" + pypi_fallback
     elif gpu_choice == "AMD":
-        return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.4"
+        return f"{base_cmd}--index-url https://download.pytorch.org/whl/rocm6.4" + pypi_fallback
     elif gpu_choice in ["APPLE", "NONE"]:
-        return f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
+        return f"{base_cmd}--index-url https://download.pytorch.org/whl/cpu" + pypi_fallback
     elif gpu_choice == "INTEL":
         intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
-        return f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+        return f"{base_cmd}{intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
     else:
         return base_cmd
 

From 7170a16b91dab7e5de5cd01e01d5239050995474 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 8 Mar 2026 04:09:18 -0700
Subject: [PATCH 007/210] Fix passing adaptive-p to llama-server

---
 modules/llama_cpp_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 12ff173e..6f7cbd20 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -129,7 +129,7 @@ class LlamaServer:
             # places it at the end of the chain regardless of position, so we
             # activate it based on the parameter value rather than sampler order.
             if state.get("adaptive_target", 0) > 0:
-                filtered_samplers.append("adaptive-p")
+                filtered_samplers.append("adaptive_p")
 
             payload["samplers"] = filtered_samplers
 

From 7a8ca9f2b01d5904149248715ba2cbd93b996bb6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 8 Mar 2026 04:09:18 -0700
Subject: [PATCH 008/210] Fix passing adaptive-p to llama-server

---
 modules/llama_cpp_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 12ff173e..6f7cbd20 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -129,7 +129,7 @@ class LlamaServer:
             # places it at the end of the chain regardless of position, so we
             # activate it based on the parameter value rather than sampler order.
             if state.get("adaptive_target", 0) > 0:
-                filtered_samplers.append("adaptive-p")
+                filtered_samplers.append("adaptive_p")
 
             payload["samplers"] = filtered_samplers
 

From 5a91b8462f9ecc487ee5fc28d4875c58a40bb7d7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 8 Mar 2026 09:53:48 -0300
Subject: [PATCH 009/210] Remove ctx_size_draft from ExLlamav3 loader

---
 modules/loaders.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index 42a5ff1c..64de3dde 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -71,7 +71,6 @@ loaders_and_params = OrderedDict({
         'gpu_split',
         'model_draft',
         'draft_max',
-        'ctx_size_draft',
         'speculative_decoding_accordion',
         'enable_tp',
         'tp_backend',

From f6ffecfff2f06e6fead00ca0895971d162c0c68b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 8 Mar 2026 10:46:51 -0300
Subject: [PATCH 010/210] Add guard against training with llama.cpp loader

---
 docs/05 - Training Tab.md | 2 +-
 modules/training.py       | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/05 - Training Tab.md b/docs/05 - Training Tab.md
index 902693e6..0bfc59aa 100644
--- a/docs/05 - Training Tab.md	
+++ b/docs/05 - Training Tab.md	
@@ -4,7 +4,7 @@ A LoRA is tied to a specific model architecture — a LoRA trained on Llama 3 8B
 
 ### Quick Start
 
-1. Load your base model (no LoRAs loaded).
+1. Load your base model with the **Transformers** loader (no LoRAs loaded).
 2. Open the **Training** tab > **Train LoRA**.
 3. Pick a dataset and configure parameters (see [below](#parameters)).
 4. Click **Start LoRA Training** and monitor the [loss](#loss).
diff --git a/modules/training.py b/modules/training.py
index 2e172d22..878bb222 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -310,6 +310,11 @@ def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en:
 
     # == Input validation / processing ==
     yield "Preparing the input..."
+
+    if shared.args.loader == 'llama.cpp':
+        yield "Error: LoRA training requires a model loaded with the Transformers loader. GGUF models are not supported for training."
+        return
+
     lora_file_path = clean_path(None, lora_name)
     if lora_file_path.strip() == '':
         yield "Missing or invalid LoRA file name input."

From 40f1837b42ac0c444b86dc937a565d3a6d03448a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 8 Mar 2026 08:38:29 -0700
Subject: [PATCH 011/210] README: Minor updates

---
 README.md         | 4 ++--
 modules/shared.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index a47a0e6f..a80ebc62 100644
--- a/README.md
+++ b/README.md
@@ -201,7 +201,7 @@ ln -s docker/{nvidia/Dockerfile,nvidia/docker-compose.yml,.dockerignore} .
 For AMD GPU:
 ln -s docker/{amd/Dockerfile,amd/docker-compose.yml,.dockerignore} .
 For Intel GPU:
-ln -s docker/{intel/Dockerfile,amd/docker-compose.yml,.dockerignore} .
+ln -s docker/{intel/Dockerfile,intel/docker-compose.yml,.dockerignore} .
 For CPU only
 ln -s docker/{cpu/Dockerfile,cpu/docker-compose.yml,.dockerignore} .
 cp docker/.env.example .env
@@ -365,7 +365,7 @@ Gradio:
 
 API:
   --api                                                Enable the API extension.
-  --public-api                                         Create a public URL for the API using Cloudfare.
+  --public-api                                         Create a public URL for the API using Cloudflare.
   --public-api-id PUBLIC_API_ID                        Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.
   --api-port API_PORT                                  The listening port for the API.
   --api-key API_KEY                                    API authentication key.
diff --git a/modules/shared.py b/modules/shared.py
index bc7ea8ba..080874ec 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -163,7 +163,7 @@ group.add_argument('--portable', action='store_true', help='Hide features not av
 # API
 group = parser.add_argument_group('API')
 group.add_argument('--api', action='store_true', help='Enable the API extension.')
-group.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.')
+group.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudflare.')
 group.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None)
 group.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.')
 group.add_argument('--api-key', type=str, default='', help='API authentication key.')

From 634609accaca98ed750bc28ca9f3bf20eb08067a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 8 Mar 2026 20:23:15 -0700
Subject: [PATCH 012/210] Fix pip installing to system Miniconda on Windows,
 revert 0132966d

---
 one_click.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/one_click.py b/one_click.py
index 5131206e..cbdeadca 100644
--- a/one_click.py
+++ b/one_click.py
@@ -111,14 +111,13 @@ def get_gpu_choice():
 def get_pytorch_install_command(gpu_choice):
     """Get PyTorch installation command based on GPU choice"""
     base_cmd = f"python -m pip install torch=={TORCH_VERSION} "
-    pypi_fallback = " --extra-index-url https://pypi.org/simple/"
 
     if gpu_choice == "NVIDIA_CUDA128":
-        return base_cmd + "--index-url https://download.pytorch.org/whl/cu128" + pypi_fallback
+        return base_cmd + "--index-url https://download.pytorch.org/whl/cu128"
     elif gpu_choice == "AMD":
-        return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.4" + pypi_fallback
+        return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.4"
     elif gpu_choice in ["APPLE", "NONE"]:
-        return base_cmd + "--index-url https://download.pytorch.org/whl/cpu" + pypi_fallback
+        return base_cmd + "--index-url https://download.pytorch.org/whl/cpu"
     elif gpu_choice == "INTEL":
         if is_linux():
             return "python -m pip install torch==2.1.0a0 intel-extension-for-pytorch==2.1.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
@@ -131,17 +130,16 @@ def get_pytorch_install_command(gpu_choice):
 def get_pytorch_update_command(gpu_choice):
     """Get PyTorch update command based on GPU choice"""
     base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} "
-    pypi_fallback = " --extra-index-url https://pypi.org/simple/"
 
     if gpu_choice == "NVIDIA_CUDA128":
-        return f"{base_cmd}--index-url https://download.pytorch.org/whl/cu128" + pypi_fallback
+        return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu128"
     elif gpu_choice == "AMD":
-        return f"{base_cmd}--index-url https://download.pytorch.org/whl/rocm6.4" + pypi_fallback
+        return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.4"
     elif gpu_choice in ["APPLE", "NONE"]:
-        return f"{base_cmd}--index-url https://download.pytorch.org/whl/cpu" + pypi_fallback
+        return f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
     elif gpu_choice == "INTEL":
         intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
-        return f"{base_cmd}{intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+        return f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
     else:
         return base_cmd
 
@@ -196,6 +194,8 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False,
     if environment:
         if is_windows():
             conda_bat_path = os.path.join(script_dir, "installer_files", "conda", "condabin", "conda.bat")
+            python_path = os.path.join(conda_env_path, "python.exe")
+            cmd = cmd.replace("python ", f'"{python_path}" ')
             cmd = f'"{conda_bat_path}" activate "{conda_env_path}" >nul && {cmd}'
         else:
             conda_sh_path = os.path.join(script_dir, "installer_files", "conda", "etc", "profile.d", "conda.sh")

From eb4a20137a4d4200d4deb05bd22dd5bc1ff2ce9f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 8 Mar 2026 20:38:50 -0700
Subject: [PATCH 013/210] Update README

---
 README.md | 85 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 68 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index a80ebc62..7831ef65 100644
--- a/README.md
+++ b/README.md
@@ -236,20 +236,25 @@ List of command-line flags
 </summary>
 
 ```txt
-usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
+usage: server.py [-h] [--user-data-dir USER_DATA_DIR] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
                  [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--image-model IMAGE_MODEL] [--image-model-dir IMAGE_MODEL_DIR] [--image-dtype {bfloat16,float16}]
                  [--image-attn-backend {flash_attention_2,sdpa}] [--image-cpu-offload] [--image-compile] [--image-quant {none,bnb-8bit,bnb-4bit,torchao-int8wo,torchao-fp4,torchao-float8wo}]
                  [--loader LOADER] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT]
                  [--ctx-size-draft CTX_SIZE_DRAFT] [--spec-type {none,ngram-mod,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-cache}] [--spec-ngram-size-n SPEC_NGRAM_SIZE_N]
                  [--spec-ngram-size-m SPEC_NGRAM_SIZE_M] [--spec-ngram-min-hits SPEC_NGRAM_MIN_HITS] [--gpu-layers N] [--cpu-moe] [--mmproj MMPROJ] [--streaming-llm] [--tensor-split TENSOR_SPLIT]
                  [--row-split] [--no-mmap] [--mlock] [--no-kv-offload] [--batch-size BATCH_SIZE] [--ubatch-size UBATCH_SIZE] [--threads THREADS] [--threads-batch THREADS_BATCH] [--numa]
-                 [--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code]
-                 [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE]
-                 [--gpu-split GPU_SPLIT] [--enable-tp] [--tp-backend TP_BACKEND] [--cfg-cache] [--cpp-runner]
-                 [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share]
-                 [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors]
-                 [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4]
-                 [--nowebui]
+                 [--parallel PARALLEL] [--fit-target FIT_TARGET] [--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16]
+                 [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE]
+                 [--quant_type QUANT_TYPE] [--gpu-split GPU_SPLIT] [--enable-tp] [--tp-backend TP_BACKEND] [--cfg-cache] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
+                 [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
+                 [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api]
+                 [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] [--temperature N]
+                 [--dynatemp-low N] [--dynatemp-high N] [--dynatemp-exponent N] [--smoothing-factor N] [--smoothing-curve N] [--min-p N] [--top-p N] [--top-k N] [--typical-p N] [--xtc-threshold N]
+                 [--xtc-probability N] [--epsilon-cutoff N] [--eta-cutoff N] [--tfs N] [--top-a N] [--top-n-sigma N] [--adaptive-target N] [--adaptive-decay N] [--dry-multiplier N]
+                 [--dry-allowed-length N] [--dry-base N] [--repetition-penalty N] [--frequency-penalty N] [--presence-penalty N] [--encoder-repetition-penalty N] [--no-repeat-ngram-size N]
+                 [--repetition-penalty-range N] [--penalty-alpha N] [--guidance-scale N] [--mirostat-mode N] [--mirostat-tau N] [--mirostat-eta N] [--do-sample | --no-do-sample]
+                 [--dynamic-temperature | --no-dynamic-temperature] [--temperature-last | --no-temperature-last] [--sampler-priority N] [--dry-sequence-breakers N]
+                 [--enable-thinking | --no-enable-thinking] [--reasoning-effort N] [--chat-template-file CHAT_TEMPLATE_FILE]
 
 Text Generation Web UI
 
@@ -257,6 +262,7 @@ options:
   -h, --help                                           show this help message and exit
 
 Basic settings:
+  --user-data-dir USER_DATA_DIR                        Path to the user data directory. Default: auto-detected.
   --multi-user                                         Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.
   --model MODEL                                        Name of the model to load by default.
   --lora LORA [LORA ...]                               The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.
@@ -280,12 +286,12 @@ Image model:
                                                        Quantization method for image model.
 
 Model loader:
-  --loader LOADER                                      Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav3,
-                                                       TensorRT-LLM.
+  --loader LOADER                                      Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav3, TensorRT-
+                                                       LLM.
 
 Context and cache:
-  --ctx-size N, --n_ctx N, --max_seq_len N             Context size in tokens. llama.cpp: 0 = auto if gpu-layers is also -1.
-  --cache-type N, --cache_type N                       KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).
+  --ctx-size, --n_ctx, --max_seq_len N                 Context size in tokens. llama.cpp: 0 = auto if gpu-layers is also -1.
+  --cache-type, --cache_type N                         KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).
 
 Speculative decoding:
   --model-draft MODEL_DRAFT                            Path to the draft model for speculative decoding.
@@ -300,7 +306,7 @@ Speculative decoding:
   --spec-ngram-min-hits SPEC_NGRAM_MIN_HITS            Minimum n-gram hits for ngram-map speculative decoding.
 
 llama.cpp:
-  --gpu-layers N, --n-gpu-layers N                     Number of layers to offload to the GPU. -1 = auto.
+  --gpu-layers, --n-gpu-layers N                       Number of layers to offload to the GPU. -1 = auto.
   --cpu-moe                                            Move the experts to the CPU (for MoE models).
   --mmproj MMPROJ                                      Path to the mmproj file for vision models.
   --streaming-llm                                      Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
@@ -314,13 +320,17 @@ llama.cpp:
   --threads THREADS                                    Number of threads to use.
   --threads-batch THREADS_BATCH                        Number of threads to use for batches/prompt processing.
   --numa                                               Activate NUMA task allocation for llama.cpp.
+  --parallel PARALLEL                                  Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set
+                                                       ctx_size to 32768.
+  --fit-target FIT_TARGET                              Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.
+                                                       Default: 1024.
   --extra-flags EXTRA_FLAGS                            Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
 
 Transformers/Accelerate:
   --cpu                                                Use the CPU to generate text. Warning: Training on CPU is extremely slow.
   --cpu-memory CPU_MEMORY                              Maximum CPU memory in GiB. Use this for CPU offloading.
   --disk                                               If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.
-  --disk-cache-dir DISK_CACHE_DIR                      Directory to save the disk cache to. Defaults to "user_data/cache".
+  --disk-cache-dir DISK_CACHE_DIR                      Directory to save the disk cache to.
   --load-in-8bit                                       Load the model with 8-bit precision (using bitsandbytes).
   --bf16                                               Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.
   --no-cache                                           Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.
@@ -341,9 +351,6 @@ ExLlamaV3:
   --tp-backend TP_BACKEND                              The backend for tensor parallelism. Valid options: native, nccl. Default: native.
   --cfg-cache                                          Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.
 
-TensorRT-LLM:
-  --cpp-runner                                         Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.
-
 RoPE:
   --alpha_value ALPHA_VALUE                            Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.
   --rope_freq_base ROPE_FREQ_BASE                      If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).
@@ -373,6 +380,50 @@ API:
   --api-enable-ipv6                                    Enable IPv6 for the API
   --api-disable-ipv4                                   Disable IPv4 for the API
   --nowebui                                            Do not launch the Gradio UI. Useful for launching the API in standalone mode.
+
+API generation defaults:
+  --temperature N                                      Temperature
+  --dynatemp-low N                                     Dynamic temperature low
+  --dynatemp-high N                                    Dynamic temperature high
+  --dynatemp-exponent N                                Dynamic temperature exponent
+  --smoothing-factor N                                 Smoothing factor
+  --smoothing-curve N                                  Smoothing curve
+  --min-p N                                            Min P
+  --top-p N                                            Top P
+  --top-k N                                            Top K
+  --typical-p N                                        Typical P
+  --xtc-threshold N                                    XTC threshold
+  --xtc-probability N                                  XTC probability
+  --epsilon-cutoff N                                   Epsilon cutoff
+  --eta-cutoff N                                       Eta cutoff
+  --tfs N                                              TFS
+  --top-a N                                            Top A
+  --top-n-sigma N                                      Top N Sigma
+  --adaptive-target N                                  Adaptive target
+  --adaptive-decay N                                   Adaptive decay
+  --dry-multiplier N                                   DRY multiplier
+  --dry-allowed-length N                               DRY allowed length
+  --dry-base N                                         DRY base
+  --repetition-penalty N                               Repetition penalty
+  --frequency-penalty N                                Frequency penalty
+  --presence-penalty N                                 Presence penalty
+  --encoder-repetition-penalty N                       Encoder repetition penalty
+  --no-repeat-ngram-size N                             No repeat ngram size
+  --repetition-penalty-range N                         Repetition penalty range
+  --penalty-alpha N                                    Penalty alpha
+  --guidance-scale N                                   Guidance scale
+  --mirostat-mode N                                    Mirostat mode
+  --mirostat-tau N                                     Mirostat tau
+  --mirostat-eta N                                     Mirostat eta
+  --do-sample, --no-do-sample                          Do sample
+  --dynamic-temperature, --no-dynamic-temperature      Dynamic temperature
+  --temperature-last, --no-temperature-last            Temperature last
+  --sampler-priority N                                 Sampler priority
+  --dry-sequence-breakers N                            DRY sequence breakers
+  --enable-thinking, --no-enable-thinking              Enable thinking
+  --reasoning-effort N                                 Reasoning effort
+  --chat-template-file CHAT_TEMPLATE_FILE              Path to a chat template file (.jinja, .jinja2, or .yaml) to use as the default instruction template for API requests. Overrides the model's
+                                                       built-in template.
 ```
 
 </details>

From 9753b2342b3ffedd3a79e675d6ab5bf70400fef8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 16:22:04 -0300
Subject: [PATCH 014/210] Fix crash on non-UTF-8 Windows locales (e.g. Chinese
 GBK)

Closes #7416
---
 cmd_windows.bat   | 1 +
 start_windows.bat | 1 +
 2 files changed, 2 insertions(+)

diff --git a/cmd_windows.bat b/cmd_windows.bat
index 787b4335..b0540bd8 100755
--- a/cmd_windows.bat
+++ b/cmd_windows.bat
@@ -21,6 +21,7 @@ set INSTALL_ENV_DIR=%cd%\installer_files\env
 set PYTHONNOUSERSITE=1
 set PYTHONPATH=
 set PYTHONHOME=
+set PYTHONUTF8=1
 set "CUDA_PATH=%INSTALL_ENV_DIR%"
 set "CUDA_HOME=%CUDA_PATH%"
 
diff --git a/start_windows.bat b/start_windows.bat
index dd096760..8da6986f 100755
--- a/start_windows.bat
+++ b/start_windows.bat
@@ -5,6 +5,7 @@ setlocal enabledelayedexpansion
 set PYTHONNOUSERSITE=1
 set PYTHONPATH=
 set PYTHONHOME=
+set PYTHONUTF8=1
 
 cd /D "%~dp0"
 

From d6643bb4bc0205eb2de6a226b528b03609e8a786 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 12:30:43 -0700
Subject: [PATCH 015/210] One-click installer: Optimize wheel downloads to only
 re-download changed wheels

---
 one_click.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/one_click.py b/one_click.py
index cbdeadca..60c6cb6f 100644
--- a/one_click.py
+++ b/one_click.py
@@ -365,8 +365,10 @@ def update_requirements(initial_installation=False, pull=True):
 
     current_commit = get_current_commit()
     wheels_changed = not os.path.exists(state_file)
+    installed_wheels = set()
     if not wheels_changed:
         state = load_state()
+        installed_wheels = set(state.get('installed_wheels', []))
         if 'wheels_changed' in state or state.get('last_installed_commit') != current_commit:
             wheels_changed = True
 
@@ -431,9 +433,17 @@ def update_requirements(initial_installation=False, pull=True):
 
     # Prepare the requirements file
     textgen_requirements = open(requirements_file).read().splitlines()
+    all_whl_lines = [line.strip() for line in textgen_requirements if '.whl' in line]
 
-    if not initial_installation and not wheels_changed:
-        textgen_requirements = [line for line in textgen_requirements if '.whl' not in line]
+    if not initial_installation:
+        if installed_wheels:
+            # Per-wheel comparison: only re-download wheels that changed
+            textgen_requirements = [
+                line for line in textgen_requirements
+                if '.whl' not in line or line.strip() not in installed_wheels
+            ]
+        elif not wheels_changed:
+            textgen_requirements = [line for line in textgen_requirements if '.whl' not in line]
 
     with open('temp_requirements.txt', 'w') as file:
         file.write('\n'.join(textgen_requirements))
@@ -452,6 +462,7 @@ def update_requirements(initial_installation=False, pull=True):
     # Save state after successful installation
     state = load_state()
     state['last_installed_commit'] = current_commit
+    state['installed_wheels'] = all_whl_lines
     state.pop('wheels_changed', None)
     save_state(state)
 

From 970055ca00d896293c603be1fd25d14d6e4e76e4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 17:08:31 -0300
Subject: [PATCH 016/210] Update Intel GPU support to use native PyTorch XPU
 wheels

PyTorch 2.9+ includes native XPU support, making
intel-extension-for-pytorch and the separate oneAPI conda
install unnecessary.

Closes #7308
---
 one_click.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/one_click.py b/one_click.py
index 60c6cb6f..189d81b8 100644
--- a/one_click.py
+++ b/one_click.py
@@ -119,10 +119,7 @@ def get_pytorch_install_command(gpu_choice):
     elif gpu_choice in ["APPLE", "NONE"]:
         return base_cmd + "--index-url https://download.pytorch.org/whl/cpu"
     elif gpu_choice == "INTEL":
-        if is_linux():
-            return "python -m pip install torch==2.1.0a0 intel-extension-for-pytorch==2.1.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
-        else:
-            return "python -m pip install torch==2.1.0a0 intel-extension-for-pytorch==2.1.10 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+        return base_cmd + "--index-url https://download.pytorch.org/whl/xpu"
     else:
         return base_cmd
 
@@ -138,8 +135,7 @@ def get_pytorch_update_command(gpu_choice):
     elif gpu_choice in ["APPLE", "NONE"]:
         return f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
     elif gpu_choice == "INTEL":
-        intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
-        return f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+        return f"{base_cmd} --index-url https://download.pytorch.org/whl/xpu"
     else:
         return base_cmd
 
@@ -316,13 +312,6 @@ def install_webui():
     install_pytorch = get_pytorch_install_command(gpu_choice)
     run_cmd(f"conda install -y ninja git && {install_pytorch}", assert_success=True, environment=True)
 
-    if gpu_choice == "INTEL":
-        # Install oneAPI dependencies via conda
-        print_big_message("Installing Intel oneAPI runtime libraries.")
-        run_cmd("conda install -y -c https://software.repos.intel.com/python/conda/ -c conda-forge dpcpp-cpp-rt=2024.0 mkl-dpcpp=2024.0", environment=True)
-        # Install libuv required by Intel-patched torch
-        run_cmd("conda install -y libuv", environment=True)
-
     # Install the webui requirements
     update_requirements(initial_installation=True, pull=False)
 

From 39e6c997cc480818911c7f3644bbcd306eca932e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 19:29:24 -0700
Subject: [PATCH 017/210] Refactor to not import gradio in `--nowebui` mode

---
 extensions/openai/models.py |   4 +-
 modules/chat.py             | 122 +++++++-----------------------------
 modules/extensions.py       |   4 +-
 modules/loaders.py          |  57 ++++++++++++++++-
 modules/models_settings.py  | 109 ++++++++++++++++++++++++++++++--
 modules/ui.py               |  54 +---------------
 server.py                   |  91 +++++++++++++--------------
 7 files changed, 232 insertions(+), 209 deletions(-)

diff --git a/extensions/openai/models.py b/extensions/openai/models.py
index d6ef119d..82c65093 100644
--- a/extensions/openai/models.py
+++ b/extensions/openai/models.py
@@ -1,4 +1,4 @@
-from modules import shared, ui
+from modules import loaders, shared
 from modules.logging_colors import logger
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model
@@ -50,7 +50,7 @@ def _load_model(data):
     # parameters exposed in the UI. Never allow security-sensitive
     # flags like trust_remote_code or extra_flags to be set via the API.
     blocked_keys = {'extra_flags'}
-    allowed_keys = set(ui.list_model_elements()) - blocked_keys
+    allowed_keys = set(loaders.list_model_elements()) - blocked_keys
     if args:
         for k in args:
             if k in allowed_keys and hasattr(shared.args, k):
diff --git a/modules/chat.py b/modules/chat.py
index 36d373d6..62d1492d 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -11,7 +11,6 @@ from datetime import datetime
 from functools import partial
 from pathlib import Path
 
-import gradio as gr
 import yaml
 from jinja2.ext import loopcontrols
 from jinja2.sandbox import ImmutableSandboxedEnvironment
@@ -1333,6 +1332,7 @@ def load_history_after_deletion(state, idx):
     Loads the latest history for the given character in chat or chat-instruct
     mode, or the latest instruct history for instruct mode.
     '''
+    import gradio as gr
 
     if shared.args.multi_user:
         return start_new_chat(state)
@@ -1351,6 +1351,7 @@ def load_history_after_deletion(state, idx):
 
 
 def update_character_menu_after_deletion(idx):
+    import gradio as gr
     characters = utils.get_available_characters()
     idx = min(int(idx), len(characters) - 1)
     idx = max(0, idx)
@@ -1565,24 +1566,6 @@ def clear_character_for_ui(state):
     return state, state['name2'], state['context'], state['greeting'], None
 
 
-def load_instruction_template(template):
-    if template == 'None':
-        return ''
-
-    for filepath in [shared.user_data_dir / 'instruction-templates' / f'{template}.yaml', shared.user_data_dir / 'instruction-templates' / 'Alpaca.yaml']:
-        if filepath.exists():
-            break
-    else:
-        return ''
-
-    file_contents = open(filepath, 'r', encoding='utf-8').read()
-    data = yaml.safe_load(file_contents)
-    if 'instruction_template' in data:
-        return data['instruction_template']
-    else:
-        return jinja_template_from_old_format(data)
-
-
 @functools.cache
 def load_character_memoized(character, name1, name2):
     return load_character(character, name1, name2)
@@ -1590,10 +1573,12 @@ def load_character_memoized(character, name1, name2):
 
 @functools.cache
 def load_instruction_template_memoized(template):
+    from modules.models_settings import load_instruction_template
     return load_instruction_template(template)
 
 
 def upload_character(file, img_path, tavern=False):
+    import gradio as gr
     img = open_image_safely(img_path)
     decoded_file = file if isinstance(file, str) else file.decode('utf-8')
     try:
@@ -1647,6 +1632,7 @@ def upload_tavern_character(img_path, _json):
 
 
 def check_tavern_character(img_path):
+    import gradio as gr
     img = open_image_safely(img_path)
 
     if img is None:
@@ -1832,6 +1818,7 @@ def delete_user(name):
 
 def update_user_menu_after_deletion(idx):
     """Update user menu after a user is deleted"""
+    import gradio as gr
     users = get_available_users()
     if len(users) == 0:
         # Create a default user if none exist
@@ -1864,93 +1851,13 @@ def handle_user_menu_change(state):
 
 def handle_save_user_click(name1):
     """Handle save user button click"""
+    import gradio as gr
     return [
         name1,
         gr.update(visible=True)
     ]
 
 
-def jinja_template_from_old_format(params, verbose=False):
-    MASTER_TEMPLATE = """
-{%- set ns = namespace(found=false) -%}
-{%- for message in messages -%}
-    {%- if message['role'] == 'system' -%}
-        {%- set ns.found = true -%}
-    {%- endif -%}
-{%- endfor -%}
-{%- if not ns.found -%}
-    {{- '<|PRE-SYSTEM|>' + '<|SYSTEM-MESSAGE|>' + '<|POST-SYSTEM|>' -}}
-{%- endif %}
-{%- for message in messages %}
-    {%- if message['role'] == 'system' -%}
-        {{- '<|PRE-SYSTEM|>' + message['content'] + '<|POST-SYSTEM|>' -}}
-    {%- else -%}
-        {%- if message['role'] == 'user' -%}
-            {{-'<|PRE-USER|>' + message['content'] + '<|POST-USER|>'-}}
-        {%- else -%}
-            {{-'<|PRE-ASSISTANT|>' + message['content'] + '<|POST-ASSISTANT|>' -}}
-        {%- endif -%}
-    {%- endif -%}
-{%- endfor -%}
-{%- if add_generation_prompt -%}
-    {{-'<|PRE-ASSISTANT-GENERATE|>'-}}
-{%- endif -%}
-"""
-
-    if 'context' in params and '<|system-message|>' in params['context']:
-        pre_system = params['context'].split('<|system-message|>')[0]
-        post_system = params['context'].split('<|system-message|>')[1]
-    else:
-        pre_system = ''
-        post_system = ''
-
-    pre_user = params['turn_template'].split('<|user-message|>')[0].replace('<|user|>', params['user'])
-    post_user = params['turn_template'].split('<|user-message|>')[1].split('<|bot|>')[0]
-
-    pre_assistant = '<|bot|>' + params['turn_template'].split('<|bot-message|>')[0].split('<|bot|>')[1]
-    pre_assistant = pre_assistant.replace('<|bot|>', params['bot'])
-    post_assistant = params['turn_template'].split('<|bot-message|>')[1]
-
-    def preprocess(string):
-        return string.replace('\n', '\\n').replace('\'', '\\\'')
-
-    pre_system = preprocess(pre_system)
-    post_system = preprocess(post_system)
-    pre_user = preprocess(pre_user)
-    post_user = preprocess(post_user)
-    pre_assistant = preprocess(pre_assistant)
-    post_assistant = preprocess(post_assistant)
-
-    if verbose:
-        print(
-            '\n',
-            repr(pre_system) + '\n',
-            repr(post_system) + '\n',
-            repr(pre_user) + '\n',
-            repr(post_user) + '\n',
-            repr(pre_assistant) + '\n',
-            repr(post_assistant) + '\n',
-        )
-
-    result = MASTER_TEMPLATE
-    if 'system_message' in params:
-        result = result.replace('<|SYSTEM-MESSAGE|>', preprocess(params['system_message']))
-    else:
-        result = result.replace('<|SYSTEM-MESSAGE|>', '')
-
-    result = result.replace('<|PRE-SYSTEM|>', pre_system)
-    result = result.replace('<|POST-SYSTEM|>', post_system)
-    result = result.replace('<|PRE-USER|>', pre_user)
-    result = result.replace('<|POST-USER|>', post_user)
-    result = result.replace('<|PRE-ASSISTANT|>', pre_assistant)
-    result = result.replace('<|PRE-ASSISTANT-GENERATE|>', pre_assistant.rstrip(' '))
-    result = result.replace('<|POST-ASSISTANT|>', post_assistant)
-
-    result = result.strip()
-
-    return result
-
-
 def my_yaml_output(data):
     '''
     pyyaml is very inconsistent with multiline strings.
@@ -2002,6 +1909,7 @@ def handle_unique_id_select(state):
 
 
 def handle_start_new_chat_click(state):
+    import gradio as gr
     history = start_new_chat(state)
     histories = find_all_histories_with_first_prompts(state)
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
@@ -2017,6 +1925,7 @@ def handle_start_new_chat_click(state):
 
 
 def handle_delete_chat_confirm_click(state):
+    import gradio as gr
     filtered_histories = find_all_histories_with_first_prompts(state)
     filtered_ids = [h[1] for h in filtered_histories]
     index = str(filtered_ids.index(state['unique_id']))
@@ -2037,6 +1946,7 @@ def handle_delete_chat_confirm_click(state):
 
 
 def handle_branch_chat_click(state):
+    import gradio as gr
     branch_from_index = state['branch_index']
     if branch_from_index == -1:
         history = state['history']
@@ -2148,6 +2058,7 @@ def handle_navigate_version_click(state):
 
 
 def handle_rename_chat_click():
+    import gradio as gr
     return [
         gr.update(value="My New Chat"),
         gr.update(visible=True),
@@ -2155,6 +2066,7 @@ def handle_rename_chat_click():
 
 
 def handle_rename_chat_confirm(rename_to, state):
+    import gradio as gr
     rename_history(state['unique_id'], rename_to, state['character_menu'], state['mode'])
     histories = find_all_histories_with_first_prompts(state)
 
@@ -2165,11 +2077,13 @@ def handle_rename_chat_confirm(rename_to, state):
 
 
 def handle_search_chat_change(state):
+    import gradio as gr
     histories = find_all_histories_with_first_prompts(state)
     return gr.update(choices=histories)
 
 
 def handle_upload_chat_history(load_chat_history, state):
+    import gradio as gr
     history = start_new_chat(state)
     history = load_history_json(load_chat_history, history)
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
@@ -2192,6 +2106,7 @@ def handle_upload_chat_history(load_chat_history, state):
 
 
 def handle_character_menu_change(state):
+    import gradio as gr
     name1, name2, picture, greeting, context = load_character(state['character_menu'], state['name1'], state['name2'])
 
     state['name1'] = name1
@@ -2244,6 +2159,7 @@ def handle_character_picture_change(picture_path):
 
 
 def handle_mode_change(state):
+    import gradio as gr
     history, loaded_unique_id = load_latest_history(state)
     histories = find_all_histories_with_first_prompts(state)
 
@@ -2270,6 +2186,7 @@ def handle_mode_change(state):
 
 
 def handle_save_character_click(name2):
+    import gradio as gr
     return [
         name2,
         gr.update(visible=True)
@@ -2277,6 +2194,7 @@ def handle_save_character_click(name2):
 
 
 def handle_load_template_click(instruction_template):
+    from modules.models_settings import load_instruction_template
     output = load_instruction_template(instruction_template)
     return [
         output,
@@ -2285,6 +2203,7 @@ def handle_load_template_click(instruction_template):
 
 
 def handle_save_template_click(instruction_template_str):
+    import gradio as gr
     contents = generate_instruction_template_yaml(instruction_template_str)
     return [
         "My Template.yaml",
@@ -2295,6 +2214,7 @@ def handle_save_template_click(instruction_template_str):
 
 
 def handle_delete_template_click(template):
+    import gradio as gr
     return [
         f"{template}.yaml",
         str(shared.user_data_dir / 'instruction-templates') + '/',
@@ -2310,6 +2230,7 @@ def handle_your_picture_change(picture, state):
 
 
 def handle_send_instruction_click(state):
+    import gradio as gr
     state['mode'] = 'instruct'
     state['history'] = {'internal': [], 'visible': [], 'metadata': {}}
 
@@ -2322,6 +2243,7 @@ def handle_send_instruction_click(state):
 
 
 def handle_send_chat_click(state):
+    import gradio as gr
     output = generate_chat_prompt("", state, _continue=True)
 
     if state["show_two_notebook_columns"]:
diff --git a/modules/extensions.py b/modules/extensions.py
index dd327882..e58a9a4c 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -6,8 +6,6 @@ from functools import partial
 from inspect import signature
 from pathlib import Path
 
-import gradio as gr
-
 import modules.shared as shared
 from modules.logging_colors import logger
 
@@ -214,6 +212,7 @@ def _apply_custom_js():
 
 
 def create_extensions_block():
+    import gradio as gr
     to_display = []
     for extension, name in iterator():
         if hasattr(extension, "ui") and not (hasattr(extension, 'params') and extension.params.get('is_tab', False)):
@@ -228,6 +227,7 @@ def create_extensions_block():
 
 
 def create_extensions_tabs():
+    import gradio as gr
     for extension, name in iterator():
         if hasattr(extension, "ui") and (hasattr(extension, 'params') and extension.params.get('is_tab', False)):
             display_name = getattr(extension, 'params', {}).get('display_name', name)
diff --git a/modules/loaders.py b/modules/loaders.py
index 64de3dde..22ee5ed9 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -1,8 +1,6 @@
 import functools
 from collections import OrderedDict
 
-import gradio as gr
-
 loaders_and_params = OrderedDict({
     'llama.cpp': [
         'gpu_layers',
@@ -276,6 +274,7 @@ def list_all_samplers():
 
 
 def blacklist_samplers(loader, dynamic_temperature):
+    import gradio as gr
     all_samplers = list_all_samplers()
     output = []
 
@@ -301,7 +300,61 @@ def get_all_params():
     return sorted(all_params)
 
 
+def list_model_elements():
+    return [
+        'filter_by_loader',
+        'loader',
+        'cpu_memory',
+        'gpu_layers',
+        'fit_target',
+        'cpu_moe',
+        'threads',
+        'threads_batch',
+        'batch_size',
+        'ubatch_size',
+        'ctx_size',
+        'cache_type',
+        'tensor_split',
+        'extra_flags',
+        'streaming_llm',
+        'gpu_split',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'compute_dtype',
+        'quant_type',
+        'load_in_8bit',
+        'load_in_4bit',
+        'attn_implementation',
+        'cpu',
+        'disk',
+        'row_split',
+        'no_kv_offload',
+        'no_mmap',
+        'mlock',
+        'numa',
+        'parallel',
+        'use_double_quant',
+        'bf16',
+        'enable_tp',
+        'tp_backend',
+        'cfg_cache',
+        'no_use_fast',
+        'model_draft',
+        'draft_max',
+        'gpu_layers_draft',
+        'device_draft',
+        'ctx_size_draft',
+        'spec_type',
+        'spec_ngram_size_n',
+        'spec_ngram_size_m',
+        'spec_ngram_min_hits',
+        'mmproj',
+    ]
+
+
 def make_loader_params_visible(loader):
+    import gradio as gr
     params = []
     all_params = get_all_params()
     if loader in loaders_and_params:
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 472871ce..5e69b60e 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -4,10 +4,9 @@ import re
 from math import floor
 from pathlib import Path
 
-import gradio as gr
 import yaml
 
-from modules import chat, loaders, metadata_gguf, shared, ui
+from modules import loaders, metadata_gguf, shared
 from modules.logging_colors import logger
 from modules.utils import resolve_model_path
 
@@ -199,7 +198,7 @@ def get_model_metadata(model):
 
     # Load instruction template if defined by name rather than by value
     if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
-        model_settings['instruction_template_str'] = chat.load_instruction_template(model_settings['instruction_template'])
+        model_settings['instruction_template_str'] = load_instruction_template(model_settings['instruction_template'])
 
     return model_settings
 
@@ -228,7 +227,7 @@ def update_model_parameters(state, initial=False):
     '''
     UI: update the command-line arguments based on the interface values
     '''
-    elements = ui.list_model_elements()  # the names of the parameters
+    elements = loaders.list_model_elements()  # the names of the parameters
 
     for i, element in enumerate(elements):
         if element not in state:
@@ -248,6 +247,7 @@ def apply_model_settings_to_state(model, state):
     '''
     UI: update the state variable with the model settings
     '''
+    import gradio as gr
     model_settings = get_model_metadata(model)
     if 'loader' in model_settings:
         loader = model_settings.pop('loader')
@@ -290,7 +290,7 @@ def save_model_settings(model, state):
     if model_regex not in user_config:
         user_config[model_regex] = {}
 
-    for k in ui.list_model_elements():
+    for k in loaders.list_model_elements():
         if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
             user_config[model_regex][k] = state[k]
 
@@ -419,3 +419,102 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type):
 
     vram_usage = estimate_vram(model, gpu_layers, ctx_size, cache_type)
     return f"<div id=\"vram-info\"'>Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span></div>"
+
+
+def load_instruction_template(template):
+    if template == 'None':
+        return ''
+
+    for filepath in [shared.user_data_dir / 'instruction-templates' / f'{template}.yaml', shared.user_data_dir / 'instruction-templates' / 'Alpaca.yaml']:
+        if filepath.exists():
+            break
+    else:
+        return ''
+
+    file_contents = open(filepath, 'r', encoding='utf-8').read()
+    data = yaml.safe_load(file_contents)
+    if 'instruction_template' in data:
+        return data['instruction_template']
+    else:
+        return _jinja_template_from_old_format(data)
+
+
+def _jinja_template_from_old_format(params, verbose=False):
+    MASTER_TEMPLATE = """
+{%- set ns = namespace(found=false) -%}
+{%- for message in messages -%}
+    {%- if message['role'] == 'system' -%}
+        {%- set ns.found = true -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- if not ns.found -%}
+    {{- '<|PRE-SYSTEM|>' + '<|SYSTEM-MESSAGE|>' + '<|POST-SYSTEM|>' -}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if message['role'] == 'system' -%}
+        {{- '<|PRE-SYSTEM|>' + message['content'] + '<|POST-SYSTEM|>' -}}
+    {%- else -%}
+        {%- if message['role'] == 'user' -%}
+            {{-'<|PRE-USER|>' + message['content'] + '<|POST-USER|>'-}}
+        {%- else -%}
+            {{-'<|PRE-ASSISTANT|>' + message['content'] + '<|POST-ASSISTANT|>' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{-'<|PRE-ASSISTANT-GENERATE|>'-}}
+{%- endif -%}
+"""
+
+    if 'context' in params and '<|system-message|>' in params['context']:
+        pre_system = params['context'].split('<|system-message|>')[0]
+        post_system = params['context'].split('<|system-message|>')[1]
+    else:
+        pre_system = ''
+        post_system = ''
+
+    pre_user = params['turn_template'].split('<|user-message|>')[0].replace('<|user|>', params['user'])
+    post_user = params['turn_template'].split('<|user-message|>')[1].split('<|bot|>')[0]
+
+    pre_assistant = '<|bot|>' + params['turn_template'].split('<|bot-message|>')[0].split('<|bot|>')[1]
+    pre_assistant = pre_assistant.replace('<|bot|>', params['bot'])
+    post_assistant = params['turn_template'].split('<|bot-message|>')[1]
+
+    def preprocess(string):
+        return string.replace('\n', '\\n').replace('\'', '\\\'')
+
+    pre_system = preprocess(pre_system)
+    post_system = preprocess(post_system)
+    pre_user = preprocess(pre_user)
+    post_user = preprocess(post_user)
+    pre_assistant = preprocess(pre_assistant)
+    post_assistant = preprocess(post_assistant)
+
+    if verbose:
+        print(
+            '\n',
+            repr(pre_system) + '\n',
+            repr(post_system) + '\n',
+            repr(pre_user) + '\n',
+            repr(post_user) + '\n',
+            repr(pre_assistant) + '\n',
+            repr(post_assistant) + '\n',
+        )
+
+    result = MASTER_TEMPLATE
+    if 'system_message' in params:
+        result = result.replace('<|SYSTEM-MESSAGE|>', preprocess(params['system_message']))
+    else:
+        result = result.replace('<|SYSTEM-MESSAGE|>', '')
+
+    result = result.replace('<|PRE-SYSTEM|>', pre_system)
+    result = result.replace('<|POST-SYSTEM|>', post_system)
+    result = result.replace('<|PRE-USER|>', pre_user)
+    result = result.replace('<|POST-USER|>', post_user)
+    result = result.replace('<|PRE-ASSISTANT|>', pre_assistant)
+    result = result.replace('<|PRE-ASSISTANT-GENERATE|>', pre_assistant.rstrip(' '))
+    result = result.replace('<|POST-ASSISTANT|>', post_assistant)
+
+    result = result.strip()
+
+    return result
diff --git a/modules/ui.py b/modules/ui.py
index 70e929f2..abbfde49 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -120,58 +120,8 @@ else:
 
 
 def list_model_elements():
-    elements = [
-        'filter_by_loader',
-        'loader',
-        'cpu_memory',
-        'gpu_layers',
-        'fit_target',
-        'cpu_moe',
-        'threads',
-        'threads_batch',
-        'batch_size',
-        'ubatch_size',
-        'ctx_size',
-        'cache_type',
-        'tensor_split',
-        'extra_flags',
-        'streaming_llm',
-        'gpu_split',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'compute_dtype',
-        'quant_type',
-        'load_in_8bit',
-        'load_in_4bit',
-        'attn_implementation',
-        'cpu',
-        'disk',
-        'row_split',
-        'no_kv_offload',
-        'no_mmap',
-        'mlock',
-        'numa',
-        'parallel',
-        'use_double_quant',
-        'bf16',
-        'enable_tp',
-        'tp_backend',
-        'cfg_cache',
-        'no_use_fast',
-        'model_draft',
-        'draft_max',
-        'gpu_layers_draft',
-        'device_draft',
-        'ctx_size_draft',
-        'spec_type',
-        'spec_ngram_size_n',
-        'spec_ngram_size_m',
-        'spec_ngram_min_hits',
-        'mmproj',
-    ]
-
-    return elements
+    from modules.loaders import list_model_elements
+    return list_model_elements()
 
 
 def list_interface_input_elements():
diff --git a/server.py b/server.py
index ff2d1db2..73f190b6 100644
--- a/server.py
+++ b/server.py
@@ -1,58 +1,20 @@
-import os
-import shutil
-import warnings
-from pathlib import Path
-
-from modules import shared, ui  # ui must be imported early to avoid circular imports
-from modules.image_models import load_image_model
-from modules.logging_colors import logger
-from modules.prompts import load_prompt
-
-# Set up Gradio temp directory path
-gradio_temp_path = shared.user_data_dir / 'cache' / 'gradio'
-shutil.rmtree(gradio_temp_path, ignore_errors=True)
-gradio_temp_path.mkdir(parents=True, exist_ok=True)
-
-# Set environment variables
-os.environ.update({
-    'GRADIO_ANALYTICS_ENABLED': 'False',
-    'BITSANDBYTES_NOWELCOME': '1',
-    'GRADIO_TEMP_DIR': str(gradio_temp_path)
-})
-
-warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
-warnings.filterwarnings('ignore', category=UserWarning, message='Using the update method is deprecated')
-warnings.filterwarnings('ignore', category=UserWarning, message='Field "model_name" has conflict')
-warnings.filterwarnings('ignore', category=UserWarning, message='The value passed into gr.Dropdown()')
-warnings.filterwarnings('ignore', category=UserWarning, message='Field "model_names" has conflict')
-
-import gradio as gr
-
 import os
 import signal
 import sys
 import time
+import warnings
 from functools import partial
+from pathlib import Path
 from threading import Lock, Thread
 
 import yaml
 
+from modules import shared, utils
+from modules.image_models import load_image_model
+from modules.logging_colors import logger
+from modules.prompts import load_prompt
+
 import modules.extensions as extensions_module
-from modules import (
-    training,
-    ui,
-    ui_chat,
-    ui_default,
-    ui_file_saving,
-    ui_image_generation,
-    ui_model_menu,
-    ui_notebook,
-    ui_parameters,
-    ui_session,
-    utils
-)
-from modules.chat import generate_pfp_cache
-from modules.extensions import apply_extensions
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
@@ -61,7 +23,13 @@ from modules.models_settings import (
     update_model_parameters
 )
 from modules.shared import do_cmd_flags_warnings
-from modules.utils import gradio
+
+os.environ['BITSANDBYTES_NOWELCOME'] = '1'
+
+warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
+warnings.filterwarnings('ignore', category=UserWarning, message='Using the update method is deprecated')
+warnings.filterwarnings('ignore', category=UserWarning, message='Field "model_name" has conflict')
+warnings.filterwarnings('ignore', category=UserWarning, message='Field "model_names" has conflict')
 
 
 def signal_handler(sig, frame):
@@ -83,6 +51,37 @@ signal.signal(signal.SIGTERM, signal_handler)
 
 def create_interface():
 
+    import shutil
+
+    import gradio as gr
+
+    from modules import (
+        training,
+        ui,
+        ui_chat,
+        ui_default,
+        ui_file_saving,
+        ui_image_generation,
+        ui_model_menu,
+        ui_notebook,
+        ui_parameters,
+        ui_session,
+    )
+    from modules.chat import generate_pfp_cache
+    from modules.extensions import apply_extensions
+    from modules.utils import gradio
+
+    warnings.filterwarnings('ignore', category=UserWarning, message='The value passed into gr.Dropdown()')
+
+    # Set up Gradio temp directory path
+    gradio_temp_path = shared.user_data_dir / 'cache' / 'gradio'
+    shutil.rmtree(gradio_temp_path, ignore_errors=True)
+    gradio_temp_path.mkdir(parents=True, exist_ok=True)
+    os.environ.update({
+        'GRADIO_ANALYTICS_ENABLED': 'False',
+        'GRADIO_TEMP_DIR': str(gradio_temp_path)
+    })
+
     title = 'Text Generation Web UI'
 
     # Password authentication

From 7f485274ebc01364b19e45df1d3e27683f7f63a2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 23:55:51 -0300
Subject: [PATCH 018/210] Fix ExLlamaV3 EOS handling, load order, and
 perplexity evaluation

- Use config.eos_token_id_list for all EOS tokens as stop conditions
  (fixes models like Llama-3 that define multiple EOS token IDs)
- Load vision/draft models before main model so autosplit accounts
  for their VRAM usage
- Fix loss computation in ExLlamav3_HF: use cache across chunks so
  sequences longer than 2048 tokens get correct perplexity values
---
 modules/exllamav3.py    | 39 ++++++++++++++++++++++-----------------
 modules/exllamav3_hf.py | 10 +++++++---
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 9ea38432..aeb68564 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -158,8 +158,21 @@ class Exllamav3Model:
             load_params['tensor_p'] = True
             load_params['tp_backend'] = shared.args.tp_backend
 
-        model.load(**load_params)
-        tokenizer = Tokenizer.from_config(config)
+        # Load vision and draft before the main model so autosplit
+        # accounts for their VRAM usage.
+
+        # Load vision model component (ExLlamaV3 native)
+        vision_model = None
+        if "vision_config" in config.config_dict:
+            logger.info("Vision component detected in model config. Attempting to load...")
+            try:
+                vision_model = Model.from_config(config, component="vision")
+                vision_model.load(progressbar=True)
+                logger.info("Vision model loaded successfully.")
+            except Exception as e:
+                logger.warning(f"Vision model loading failed (multimodal disabled): {e}")
+        else:
+            logger.info("No vision component in model config. Skipping multimodal setup.")
 
         # Initialize draft model for speculative decoding
         draft_model = None
@@ -185,18 +198,9 @@ class Exllamav3Model:
                 draft_model.load(**draft_load_params)
                 logger.info(f"Draft model loaded successfully. Max speculative tokens: {shared.args.draft_max}")
 
-        # Load vision model component (ExLlamaV3 native)
-        vision_model = None
-        if "vision_config" in config.config_dict:
-            logger.info("Vision component detected in model config. Attempting to load...")
-            try:
-                vision_model = Model.from_config(config, component="vision")
-                vision_model.load(progressbar=True)
-                logger.info("Vision model loaded successfully.")
-            except Exception as e:
-                logger.warning(f"Vision model loading failed (multimodal disabled): {e}")
-        else:
-            logger.info("No vision component in model config. Skipping multimodal setup.")
+        # Load main model last
+        model.load(**load_params)
+        tokenizer = Tokenizer.from_config(config)
 
         generator = Generator(
             model=model,
@@ -379,11 +383,12 @@ class Exllamav3Model:
         else:
             max_new_tokens = state['max_new_tokens']
 
-        # Get stop conditions
+        # Use full EOS token list from config (may contain multiple IDs)
         stop_conditions = []
         if not state['ban_eos_token']:
-            if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None:
-                stop_conditions.append(self.tokenizer.eos_token_id)
+            for eos_id in self.config.eos_token_id_list:
+                if eos_id is not None:
+                    stop_conditions.append(eos_id)
 
         seed = state.get('seed', -1)
         job = Job(
diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index b4b6ad20..d3c1cb90 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -201,19 +201,23 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
                 }
             ).to(input_ids.device).float()
         else:
-            # When processing with labels, handle as a complete sequence
-            # Process in chunks if the number of tokens is large
+            # Labels path: use cache for cross-chunk attention.
             tokens_to_process = seq_tensor
             all_logits = None
+            current_len = 0
 
             for i in range(0, tokens_to_process.shape[0], max_chunk_size):
                 chunk = tokens_to_process[i:i + max_chunk_size]
                 chunk_logits = self.ex_model.forward(
                     input_ids=chunk.view(1, -1),
                     params={
-                        "attn_mode": "flash_attn_nc",
+                        "attn_mode": "flash_attn",
+                        "cache": ex_cache,
+                        "past_len": current_len,
+                        "batch_shape": (1, self.max_tokens),
                     }
                 ).float()
+                current_len += chunk.shape[0]
 
                 if all_logits is None:
                     all_logits = chunk_logits

From 83b7e47d77c74da1c9edd7b242c3d89dd708e39e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 20:12:54 -0700
Subject: [PATCH 019/210] Update README

---
 README.md | 56 +++++++++++++++++--------------------------------------
 1 file changed, 17 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 7831ef65..d5d0a3ce 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 
 # Text Generation Web UI
 
-A Gradio web UI for running Large Language Models locally. 100% private, offline, and free.
+Run large language models locally with full privacy. Supports text generation, vision, image generation, training, tool-calling, and more — across multiple backends including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). 100% offline, zero telemetry.
 
 [Try the Deep Reason extension](https://oobabooga.gumroad.com/l/deep_reason)
 
@@ -23,22 +23,21 @@ A Gradio web UI for running Large Language Models locally. 100% private, offline
 
 ## Features
 
-- Supports multiple local text generation backends, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)).
-- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
+- **Multiple backends**: [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Switch between backends and models without restarting.
+- **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
+- **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).
+- **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
+- **OpenAI-compatible API**: Chat and Completions endpoints with tool-calling support — use as a local drop-in replacement for the OpenAI API ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
+- **Web search**: Search the internet with LLM-generated queries to add context to conversations.
+- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
+- **Easy setup**: [Portable builds](https://github.com/oobabooga/text-generation-webui/releases) (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or a one-click installer for the full feature set.
 - 100% offline and private, with zero telemetry, external resources, or remote update requests.
 - `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters. Prompts are automatically formatted with Jinja2 templates.
-- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
-- **Vision (multimodal models)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
-- **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).
-- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
-- **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Easy to use, good defaults, and supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
 - Edit messages, navigate between message versions, and branch conversations at any point.
-- Switch between different models in the UI without restarting.
 - Free-form text generation in the Notebook tab without being limited to chat turns.
 - Multiple sampling parameters and generation options for sophisticated text generation control.
 - Aesthetic UI with dark and light themes.
 - Syntax highlighting for code blocks and LaTeX rendering for mathematical expressions.
-- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
 - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 
 ## How to install
@@ -430,22 +429,14 @@ API generation defaults:
 
 ## Downloading models
 
-Models should be placed in the folder `text-generation-webui/user_data/models`. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads&search=gguf).
+Download a GGUF model file from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads&search=gguf) and place it in the `user_data/models` folder. That's it — the UI will detect it automatically. You can also download models directly from the Model tab in the UI.
 
-To check if a GGUF model will fit in your hardware before downloading it, you can use this tool I created:
+Not sure what will fit your GPU? Use the [VRAM Calculator](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator).
 
-[Accurate GGUF VRAM Calculator](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator)
+<details>
+<summary>Other model types (Transformers, EXL3)</summary>
 
-* GGUF models are a single file and should be placed directly into `user_data/models`. Example:
-
-```
-text-generation-webui
-└── user_data
-    └── models
-        └── llama-2-13b-chat.Q4_K_M.gguf
-```
-
-* The remaining model types (like 16-bit Transformers models and EXL3 models) are made of several files and must be placed in a subfolder. Example:
+Models that consist of multiple files (like 16-bit Transformers models and EXL3 models) should be placed in a subfolder inside `user_data/models`:
 
 ```
 text-generation-webui
@@ -455,31 +446,18 @@ text-generation-webui
             ├── config.json
             ├── generation_config.json
             ├── model-00001-of-00004.safetensors
-            ├── model-00002-of-00004.safetensors
-            ├── model-00003-of-00004.safetensors
-            ├── model-00004-of-00004.safetensors
-            ├── model.safetensors.index.json
-            ├── special_tokens_map.json
+            ├── ...
             ├── tokenizer_config.json
             └── tokenizer.json
 ```
 
-In both cases, you can use the "Model" tab of the UI to download the model from Hugging Face automatically. It is also possible to download it via the command-line with:
-
-```
-python download-model.py organization/model
-```
-
-Run `python download-model.py --help` to see all the options.
+These formats require the one-click installer (not the portable build).
+</details>
 
 ## Documentation
 
 https://github.com/oobabooga/text-generation-webui/wiki
 
-## Google Colab notebook
-
-https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
-
 ## Community
 
 https://www.reddit.com/r/Oobabooga/

From 3b7193265812dba17ab757b3853b78dae3d9ba2e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 20:18:09 -0700
Subject: [PATCH 020/210] Update README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d5d0a3ce..8dc09f32 100644
--- a/README.md
+++ b/README.md
@@ -429,7 +429,7 @@ API generation defaults:
 
 ## Downloading models
 
-Download a GGUF model file from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads&search=gguf) and place it in the `user_data/models` folder. That's it — the UI will detect it automatically. You can also download models directly from the Model tab in the UI.
+Download a GGUF model file from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads&search=gguf) and place it in the `user_data/models` folder. That's it — the UI will detect it automatically.
 
 Not sure what will fit your GPU? Use the [VRAM Calculator](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator).
 

From 15792c3cb8095c788c6737fdced34ccac451aff6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 20:31:05 -0700
Subject: [PATCH 021/210] Update ExLlamaV3 to 0.0.24

---
 requirements/full/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index eaf34fa8..a3d4d1e6 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -42,7 +42,7 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.23/exllamav3-0.0.23+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.23/exllamav3-0.0.23+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.24/exllamav3-0.0.24+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.24/exllamav3-0.0.24+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"

From c604ca66de4a25fc64b6c452ffb0cc611ead3d0a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 21:36:04 -0700
Subject: [PATCH 022/210] Update the --multi-user warning

---
 modules/shared.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index 080874ec..50dfc7ab 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -47,7 +47,7 @@ parser = argparse.ArgumentParser(description="Text Generation Web UI", conflict_
 # Basic settings
 group = parser.add_argument_group('Basic settings')
 group.add_argument('--user-data-dir', type=str, default=str(user_data_dir), help='Path to the user data directory. Default: auto-detected.')
-group.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.')
+group.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. Best suited for small trusted teams.')
 group.add_argument('--model', type=str, help='Name of the model to load by default.')
 group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
 group.add_argument('--model-dir', type=str, default=str(user_data_dir / 'models'), help='Path to directory with all the models.')
@@ -396,8 +396,15 @@ def do_cmd_flags_warnings():
             logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")
         if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)):
             logger.warning("\nYou are potentially exposing the web UI to the entire internet without any access password.\nYou can create one with the \"--gradio-auth\" flag like this:\n\n--gradio-auth username:password\n\nMake sure to replace username:password with your own.")
-            if args.multi_user:
-                logger.warning('\nThe multi-user mode is highly experimental and should not be shared publicly.')
+    if args.multi_user:
+        logger.warning(
+            'Multi-user mode is enabled. Known limitations:'
+            '\n- The Stop button stops generation for all users, not just you.'
+            '\n- Chat history is not saved and will be lost on page refresh.'
+            '\n- Only one user can generate at a time unless using a parallel-capable backend (e.g. llama.cpp with --parallel N for N > 1, or ExLlamaV3).'
+            '\n\nThis mode works best for small trusted teams.'
+            '\n\nDo not expose publicly. Grayed-out actions can easily be bypassed client-side.\n'
+        )
 
 
 def apply_image_model_cli_overrides():

From 307c085d1b7b3c0f60d40276c50f277b2ed1a0f4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 9 Mar 2026 21:44:53 -0700
Subject: [PATCH 023/210] Remove leading newline from the no-password warning

---
 modules/shared.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/shared.py b/modules/shared.py
index 50dfc7ab..dbd805a1 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -395,7 +395,7 @@ def do_cmd_flags_warnings():
         if args.share:
             logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")
         if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)):
-            logger.warning("\nYou are potentially exposing the web UI to the entire internet without any access password.\nYou can create one with the \"--gradio-auth\" flag like this:\n\n--gradio-auth username:password\n\nMake sure to replace username:password with your own.")
+            logger.warning("You are potentially exposing the web UI to the entire internet without any access password.\nYou can create one with the \"--gradio-auth\" flag like this:\n\n--gradio-auth username:password\n\nMake sure to replace username:password with your own.")
     if args.multi_user:
         logger.warning(
             'Multi-user mode is enabled. Known limitations:'

From 6ec4ca8b102725d2ebb810c07ad668179835a567 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 10 Mar 2026 09:56:07 -0300
Subject: [PATCH 024/210] Add missing custom_token_bans to llama.cpp and
 reasoning_effort to ExLlamav3

---
 modules/loaders.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/loaders.py b/modules/loaders.py
index 22ee5ed9..d2ebdbc3 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -205,6 +205,7 @@ loaders_samplers = {
         'ban_eos_token',
         'add_bos_token',
         'enable_thinking',
+        'reasoning_effort',
         'seed',
         'skip_special_tokens',
     },
@@ -241,6 +242,7 @@ loaders_samplers = {
         'reasoning_effort',
         'seed',
         'sampler_priority',
+        'custom_token_bans',
         'dry_sequence_breakers',
         'grammar_string',
         'grammar_file_row',

From 8aeaa763659edc55a0159658e5eafd72552344e5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:41:15 -0300
Subject: [PATCH 025/210] Forward logit_bias, logprobs, and n to llama.cpp
 backend

- Forward logit_bias and logprobs natively to llama.cpp
- Support n>1 completions with seed increment for diversity
- Fix logprobs returning empty dict when not requested
---
 extensions/openai/completions.py | 96 +++++++++++++++++++++++---------
 extensions/openai/script.py      |  6 ++
 extensions/openai/typing.py      |  2 +-
 modules/llama_cpp_server.py      | 20 ++++++-
 4 files changed, 94 insertions(+), 30 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 8ba031c1..d70e69e6 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -37,6 +37,24 @@ def load_chat_template_file(filepath):
     return text
 
 
+def get_logprobs_from_llama_cpp():
+    """Read logprobs captured from the llama.cpp server response."""
+    if not hasattr(shared.model, 'last_completion_probabilities') or not shared.model.last_completion_probabilities:
+        return None
+
+    # Convert llama.cpp format to {token: logprob} dict
+    # llama.cpp returns: [{"token": "text", "logprob": -0.5, "top_logprobs": [{"token": "t", "logprob": -0.1}, ...]}, ...]
+    result = {}
+    for entry in shared.model.last_completion_probabilities:
+        top = entry.get('top_logprobs', entry.get('top_probs', []))
+        for item in top:
+            token = item.get('token', '')
+            logprob = item.get('logprob', item.get('prob', 0))
+            result[token] = logprob
+
+    return result
+
+
 def convert_logprobs_to_tiktoken(model, logprobs):
     # more problems than it's worth.
     # try:
@@ -72,6 +90,7 @@ def process_parameters(body, is_legacy=False):
         elif isinstance(body['stop'], list):
             generate_params['custom_stopping_strings'] = body['stop']
 
+    # For llama.cpp, logit_bias and logprobs are forwarded natively via prepare_payload()
     if shared.args.loader != 'llama.cpp':
         from transformers import LogitsProcessorList
 
@@ -85,13 +104,10 @@ def process_parameters(body, is_legacy=False):
         if logit_bias:  # {str: float, ...}
             logits_processor = [LogitsBiasProcessor(logit_bias)]
 
-        logprobs = None  # coming to chat eventually
-        if 'logprobs' in body:
-            logprobs = body.get('logprobs', 0)  # maybe cap at topk? don't clamp 0-5.
+        logprobs = body.get('logprobs', None)
+        if logprobs is not None and logprobs > 0:
             generate_params['logprob_proc'] = LogprobProcessor(logprobs)
             logits_processor.extend([generate_params['logprob_proc']])
-        else:
-            logprobs = None
 
         if logits_processor:  # requires logits_processor support
             generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
@@ -456,6 +472,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
             logger.info(f"Found {len(raw_images)} image(s) in request.")
             generate_params['raw_images'] = raw_images
 
+    n_completions = body.get('n', 1) or 1
+
     if not stream:
         prompt_arg = body[prompt_str]
 
@@ -469,6 +487,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
         resp_list_data = []
         total_completion_token_count = 0
         total_prompt_token_count = 0
+        choice_index = 0
 
         for idx, prompt in enumerate(prompt_arg, start=0):
             if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], int):
@@ -483,31 +502,46 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                         prompt = decode(prompt)[0]
 
             prefix = prompt if echo else ''
-
-            # generate reply #######################################
-            debug_msg({'prompt': prompt, 'generate_params': generate_params})
-            generator = generate_reply(prompt, generate_params, is_chat=False)
-            answer = ''
-
-            for a in generator:
-                answer = a
-
             token_count = len(encode(prompt)[0])
             total_prompt_token_count += token_count
-            completion_token_count = len(encode(answer)[0])
-            total_completion_token_count += completion_token_count
-            stop_reason = "stop"
-            if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
-                stop_reason = "length"
 
-            respi = {
-                "index": idx,
-                "finish_reason": stop_reason,
-                "text": prefix + answer + suffix,
-                "logprobs": {'top_logprobs': [logprob_proc.token_alternatives]} if logprob_proc else None,
-            }
+            original_seed = generate_params.get('seed', -1)
+            for _n in range(n_completions):
+                # Increment seed for each completion to ensure diversity (matches llama.cpp native behavior)
+                if original_seed >= 0:
+                    generate_params['seed'] = original_seed + _n
 
-            resp_list_data.extend([respi])
+                # generate reply #######################################
+                debug_msg({'prompt': prompt, 'generate_params': generate_params})
+                generator = generate_reply(prompt, generate_params, is_chat=False)
+                answer = ''
+
+                for a in generator:
+                    answer = a
+
+                completion_token_count = len(encode(answer)[0])
+                total_completion_token_count += completion_token_count
+                stop_reason = "stop"
+                if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
+                    stop_reason = "length"
+
+                if logprob_proc:
+                    completion_logprobs = {'top_logprobs': [logprob_proc.token_alternatives]}
+                elif shared.args.loader == 'llama.cpp':
+                    llama_logprobs = get_logprobs_from_llama_cpp()
+                    completion_logprobs = {'top_logprobs': [llama_logprobs]} if llama_logprobs else None
+                else:
+                    completion_logprobs = None
+
+                respi = {
+                    "index": choice_index,
+                    "finish_reason": stop_reason,
+                    "text": prefix + answer + suffix,
+                    "logprobs": completion_logprobs,
+                }
+
+                resp_list_data.append(respi)
+                choice_index += 1
 
         resp = {
             "id": cmpl_id,
@@ -540,6 +574,14 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
 
         def text_streaming_chunk(content):
             # begin streaming
+            if logprob_proc:
+                chunk_logprobs = {'top_logprobs': [logprob_proc.token_alternatives]}
+            elif shared.args.loader == 'llama.cpp':
+                llama_logprobs = get_logprobs_from_llama_cpp()
+                chunk_logprobs = {'top_logprobs': [llama_logprobs]} if llama_logprobs else None
+            else:
+                chunk_logprobs = None
+
             chunk = {
                 "id": cmpl_id,
                 "object": object_type,
@@ -549,7 +591,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                     "index": 0,
                     "finish_reason": None,
                     "text": content,
-                    "logprobs": {'top_logprobs': [logprob_proc.token_alternatives]} if logprob_proc else None,
+                    "logprobs": chunk_logprobs,
                 }],
             }
 
diff --git a/extensions/openai/script.py b/extensions/openai/script.py
index 7a13638d..94c7650f 100644
--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@@ -119,6 +119,12 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
     is_legacy = "/generate" in path
 
     if request_data.stream:
+        if (request_data.n or 1) > 1:
+            return JSONResponse(
+                status_code=400,
+                content={"error": {"message": "n > 1 is not supported with streaming.", "type": "invalid_request_error", "param": "n", "code": None}}
+            )
+
         stop_event = threading.Event()
 
         async def generator():
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index e48b7b60..2156074b 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -109,7 +109,7 @@ class CompletionRequestParams(BaseModel):
     logit_bias: dict | None = None
     logprobs: int | None = None
     max_tokens: int | None = 512
-    n: int | None = Field(default=1, description="Unused parameter.")
+    n: int | None = Field(default=1, description="Number of completions to generate. Only supported without streaming.")
     presence_penalty: float | None = shared.args.presence_penalty
     stop: str | List[str] | None = None
     stream: bool | None = False
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 6f7cbd20..a3e431ac 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -133,9 +133,20 @@ class LlamaServer:
 
             payload["samplers"] = filtered_samplers
 
+        logit_bias = []
         if state['custom_token_bans']:
-            to_ban = [[int(token_id), False] for token_id in state['custom_token_bans'].split(',')]
-            payload["logit_bias"] = to_ban
+            logit_bias.extend([[int(token_id), False] for token_id in state['custom_token_bans'].split(',')])
+
+        if state.get('logit_bias'):
+            for token_id_str, bias in state['logit_bias'].items():
+                logit_bias.append([int(token_id_str), bias])
+
+        if logit_bias:
+            payload["logit_bias"] = logit_bias
+
+        n_probs = state.get('logprobs', 0)
+        if n_probs and n_probs > 0:
+            payload["n_probs"] = n_probs
 
         return payload
 
@@ -215,6 +226,7 @@ class LlamaServer:
                 response.raise_for_status()  # Raise an exception for HTTP errors
 
             full_text = ""
+            self.last_completion_probabilities = []
 
             # Process the streaming response
             stop_event = state.get('stop_event')
@@ -240,6 +252,10 @@ class LlamaServer:
                         full_text += data['content']
                         yield full_text
 
+                    # Capture logprobs if present
+                    if 'completion_probabilities' in data:
+                        self.last_completion_probabilities.extend(data['completion_probabilities'])
+
                     # Check if generation is complete
                     if data.get('stop', False):
                         break

From 3304b57bdf050a3d5dfed375c67a8cac1c4a0eaa Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 10 Mar 2026 11:03:00 -0300
Subject: [PATCH 026/210] Add native logit_bias and logprobs support for
 ExLlamav3

---
 extensions/openai/completions.py | 24 +++++------
 modules/exllamav3.py             | 70 ++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 12 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index d70e69e6..04e644d6 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -37,13 +37,13 @@ def load_chat_template_file(filepath):
     return text
 
 
-def get_logprobs_from_llama_cpp():
-    """Read logprobs captured from the llama.cpp server response."""
+def get_logprobs_from_backend():
+    """Read logprobs captured from llama.cpp or ExLlamav3 native backend."""
     if not hasattr(shared.model, 'last_completion_probabilities') or not shared.model.last_completion_probabilities:
         return None
 
-    # Convert llama.cpp format to {token: logprob} dict
-    # llama.cpp returns: [{"token": "text", "logprob": -0.5, "top_logprobs": [{"token": "t", "logprob": -0.1}, ...]}, ...]
+    # Both backends store data in shared.model.last_completion_probabilities
+    # Format: [{"top_logprobs": [{"token": "text", "logprob": -0.5}, ...]}, ...]
     result = {}
     for entry in shared.model.last_completion_probabilities:
         top = entry.get('top_logprobs', entry.get('top_probs', []))
@@ -90,8 +90,8 @@ def process_parameters(body, is_legacy=False):
         elif isinstance(body['stop'], list):
             generate_params['custom_stopping_strings'] = body['stop']
 
-    # For llama.cpp, logit_bias and logprobs are forwarded natively via prepare_payload()
-    if shared.args.loader != 'llama.cpp':
+    # For llama.cpp and ExLlamav3 native, logit_bias and logprobs are forwarded natively
+    if shared.args.loader not in ('llama.cpp', 'ExLlamav3'):
         from transformers import LogitsProcessorList
 
         from modules.transformers_loader import (
@@ -527,9 +527,9 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
 
                 if logprob_proc:
                     completion_logprobs = {'top_logprobs': [logprob_proc.token_alternatives]}
-                elif shared.args.loader == 'llama.cpp':
-                    llama_logprobs = get_logprobs_from_llama_cpp()
-                    completion_logprobs = {'top_logprobs': [llama_logprobs]} if llama_logprobs else None
+                elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
+                    backend_logprobs = get_logprobs_from_backend()
+                    completion_logprobs = {'top_logprobs': [backend_logprobs]} if backend_logprobs else None
                 else:
                     completion_logprobs = None
 
@@ -576,9 +576,9 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
             # begin streaming
             if logprob_proc:
                 chunk_logprobs = {'top_logprobs': [logprob_proc.token_alternatives]}
-            elif shared.args.loader == 'llama.cpp':
-                llama_logprobs = get_logprobs_from_llama_cpp()
-                chunk_logprobs = {'top_logprobs': [llama_logprobs]} if llama_logprobs else None
+            elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
+                backend_logprobs = get_logprobs_from_backend()
+                chunk_logprobs = {'top_logprobs': [backend_logprobs]} if backend_logprobs else None
             else:
                 chunk_logprobs = None
 
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index aeb68564..1c682e49 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -1,3 +1,4 @@
+import math
 import queue
 import threading
 import traceback
@@ -9,6 +10,7 @@ import torch
 from exllamav3 import Cache, Config, Generator, Model, Tokenizer
 from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
 from exllamav3.generator import Job
+from exllamav3.generator.filter import Filter
 from exllamav3.generator.sampler import (
     CustomSampler,
     SS_AdaptiveP,
@@ -36,6 +38,29 @@ except Exception:
     traceback.print_exc()
 
 
+class LogitBiasFilter(Filter):
+    """Filter subclass that applies a static additive logit bias mask."""
+
+    def __init__(self, tokenizer, logit_bias_dict):
+        super().__init__(tokenizer=tokenizer, trigger_token=None, prefix_str=None, eos_after_completed=False)
+        self.logit_bias_dict = logit_bias_dict
+        self._mask = None
+
+    def reset(self): pass
+    def accept_token(self, token): pass
+    def is_completed(self): return False
+    def use_background_worker(self): return False
+
+    def get_next_logit_mask(self):
+        if self._mask is None:
+            self._mask = torch.zeros((1, self.vocab_size), dtype=self.logits_dtype)
+            for token_id_str, bias in self.logit_bias_dict.items():
+                token_id = int(token_id_str)
+                if 0 <= token_id < self.vocab_size:
+                    self._mask[0, token_id] = bias
+        return self._mask
+
+
 class ConcurrentGenerator:
     def __init__(self, generator):
         self.generator = generator
@@ -98,6 +123,10 @@ class Exllamav3Model:
     def __init__(self):
         pass
 
+    @property
+    def device(self) -> torch.device:
+        return torch.device(0)
+
     @classmethod
     def from_pretrained(cls, path_to_model):
         path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
@@ -390,6 +419,16 @@ class Exllamav3Model:
                 if eos_id is not None:
                     stop_conditions.append(eos_id)
 
+        # Build filters for logit_bias (OpenAI API)
+        filters = []
+        logit_bias = state.get('logit_bias')
+        if logit_bias:
+            filters.append(LogitBiasFilter(self.tokenizer, logit_bias))
+
+        # Logprobs support (OpenAI API)
+        logprobs = state.get('logprobs', 0) or 0
+        return_top_tokens = logprobs if logprobs > 0 else 0
+
         seed = state.get('seed', -1)
         job = Job(
             input_ids=input_ids,
@@ -399,11 +438,15 @@ class Exllamav3Model:
             sampler=sampler,
             seed=seed if seed >= 0 else None,
             stop_conditions=stop_conditions if stop_conditions else None,
+            filters=filters if filters else None,
+            return_top_tokens=return_top_tokens,
+            return_probs=return_top_tokens > 0,
         )
 
         # Stream generation
         response_text = ""
         stop_event = state.get('stop_event')
+        self.last_completion_probabilities = []
 
         result_queue = self.parallel_generator.submit(job)
         try:
@@ -415,14 +458,41 @@ class Exllamav3Model:
                 except queue.Empty:
                     continue
                 if result is None or result.get("eos"):
+                    # Capture logprobs from the final eos result too
+                    if result is not None and return_top_tokens > 0:
+                        self._capture_logprobs(result)
                     break
                 chunk = result.get("text", "")
+
+                # Capture logprobs from streaming results
+                if return_top_tokens > 0:
+                    self._capture_logprobs(result)
+
                 if chunk:
                     response_text += chunk
                     yield response_text
         finally:
             self.parallel_generator.cancel(job)
 
+    def _capture_logprobs(self, result):
+        """Convert ExLlamav3 top-k token data to the shared logprobs format."""
+        top_k_tokens = result.get("top_k_tokens")
+        top_k_probs = result.get("top_k_probs")
+        if top_k_tokens is None or top_k_probs is None:
+            return
+
+        id_to_piece = self.tokenizer.get_id_to_piece_list(True)
+        # top_k_tokens shape: (batch, seq_len, k), top_k_probs same
+        for seq_idx in range(top_k_tokens.shape[1]):
+            entry = {"top_logprobs": []}
+            for k_idx in range(top_k_tokens.shape[2]):
+                token_id = top_k_tokens[0, seq_idx, k_idx].item()
+                prob = top_k_probs[0, seq_idx, k_idx].item()
+                token_str = id_to_piece[token_id] if token_id < len(id_to_piece) else f"<{token_id}>"
+                logprob = math.log(prob) if prob > 0 else float("-inf")
+                entry["top_logprobs"].append({"token": token_str, "logprob": logprob})
+            self.last_completion_probabilities.append(entry)
+
     def generate(self, prompt, state):
         output = ""
         for chunk in self.generate_with_streaming(prompt, state):

From f1cfeae37279fd26fe5d6cd1f1ea475818f55937 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 10 Mar 2026 20:55:49 -0700
Subject: [PATCH 027/210] API: Improve OpenAI spec compliance in streaming and
 non-streaming responses

---
 extensions/openai/completions.py | 89 ++++++++++++++++++++++++++------
 extensions/openai/typing.py      |  6 +++
 2 files changed, 80 insertions(+), 15 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 04e644d6..03c4b03e 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -310,28 +310,41 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     requested_model = generate_params.pop('model')
     logprob_proc = generate_params.pop('logprob_proc', None)
 
-    def chat_streaming_chunk(content, chunk_tool_calls=None):
+    def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False):
         # begin streaming
+        delta = {}
+        if include_role:
+            delta['role'] = 'assistant'
+            delta['refusal'] = None
+        if content is not None:
+            delta['content'] = content
+        if chunk_tool_calls:
+            delta['tool_calls'] = chunk_tool_calls
+
         chunk = {
             "id": cmpl_id,
             "object": object_type,
             "created": created_time,
             "model": shared.model_name,
+            "system_fingerprint": None,
             resp_list: [{
                 "index": 0,
                 "finish_reason": None,
-                "delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls},
+                "delta": delta,
+                "logprobs": None,
             }],
         }
 
         if logprob_proc:  # not official for chat yet
             top_logprobs = convert_logprobs_to_tiktoken(model=requested_model, logprobs=logprob_proc.token_alternatives)
             chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
-        # else:
-        #    chunk[resp_list][0]["logprobs"] = None
 
         return chunk
 
+    # Check if usage should be included in streaming chunks per OpenAI spec
+    stream_options = body.get('stream_options')
+    include_usage = bool(stream_options) and bool(stream_options.get('include_usage') if isinstance(stream_options, dict) else getattr(stream_options, 'include_usage', False))
+
     # generate reply #######################################
     if prompt_only:
         prompt = generate_chat_prompt(user_input, generate_params, _continue=continue_)
@@ -339,7 +352,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         return
 
     if stream:
-        yield chat_streaming_chunk('')
+        chunk = chat_streaming_chunk('', include_role=True)
+        if include_usage:
+            chunk['usage'] = None
+        yield chunk
 
     generator = generate_chat_reply(
         user_input, generate_params, regenerate=False, _continue=continue_, loading_message=False)
@@ -372,6 +388,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
                 continue
 
             chunk = chat_streaming_chunk(new_content)
+            if include_usage:
+                chunk['usage'] = None
 
             seen_content = answer
             yield chunk
@@ -389,25 +407,42 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         stop_reason = "length"
 
     if stream:
-        chunk = chat_streaming_chunk('', tool_calls)
+        chunk = chat_streaming_chunk(chunk_tool_calls=tool_calls)
         chunk[resp_list][0]['finish_reason'] = stop_reason
-        chunk['usage'] = {
+        usage = {
             "prompt_tokens": token_count,
             "completion_tokens": completion_token_count,
             "total_tokens": token_count + completion_token_count
         }
 
-        yield chunk
+        if include_usage:
+            chunk['usage'] = None
+            yield chunk
+            # Separate usage-only chunk with choices: [] per OpenAI spec
+            yield {
+                "id": cmpl_id,
+                "object": object_type,
+                "created": created_time,
+                "model": shared.model_name,
+                "system_fingerprint": None,
+                resp_list: [],
+                "usage": usage
+            }
+        else:
+            chunk['usage'] = usage
+            yield chunk
     else:
         resp = {
             "id": cmpl_id,
             "object": object_type,
             "created": created_time,
             "model": shared.model_name,
+            "system_fingerprint": None,
             resp_list: [{
                 "index": 0,
                 "finish_reason": stop_reason,
-                "message": {"role": "assistant", "content": answer, **({"tool_calls": tool_calls} if tool_calls else {})},
+                "message": {"role": "assistant", "refusal": None, "content": answer, **({"tool_calls": tool_calls} if tool_calls else {})},
+                "logprobs": None,
             }],
             "usage": {
                 "prompt_tokens": token_count,
@@ -418,8 +453,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         if logprob_proc:  # not official for chat yet
             top_logprobs = convert_logprobs_to_tiktoken(model=requested_model, logprobs=logprob_proc.token_alternatives)
             resp[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
-        # else:
-        #     resp[resp_list][0]["logprobs"] = None
 
         yield resp
 
@@ -427,7 +460,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
 def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_event=None):
     object_type = 'text_completion'
     created_time = int(time.time())
-    cmpl_id = "conv-%d" % (int(time.time() * 1000000000))
+    cmpl_id = "cmpl-%d" % (int(time.time() * 1000000000))
     resp_list = 'data' if is_legacy else 'choices'
 
     prompt_str = 'context' if is_legacy else 'prompt'
@@ -548,6 +581,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
             "object": object_type,
             "created": created_time,
             "model": shared.model_name,
+            "system_fingerprint": None,
             resp_list: resp_list_data,
             "usage": {
                 "prompt_tokens": total_prompt_token_count,
@@ -572,6 +606,10 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
         prefix = prompt if echo else ''
         token_count = len(encode(prompt)[0])
 
+        # Check if usage should be included in streaming chunks per OpenAI spec
+        stream_options = body.get('stream_options')
+        include_usage = bool(stream_options) and bool(stream_options.get('include_usage') if isinstance(stream_options, dict) else getattr(stream_options, 'include_usage', False))
+
         def text_streaming_chunk(content):
             # begin streaming
             if logprob_proc:
@@ -587,6 +625,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                 "object": object_type,
                 "created": created_time,
                 "model": shared.model_name,
+                "system_fingerprint": None,
                 resp_list: [{
                     "index": 0,
                     "finish_reason": None,
@@ -597,7 +636,10 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
 
             return chunk
 
-        yield text_streaming_chunk(prefix)
+        chunk = text_streaming_chunk(prefix)
+        if include_usage:
+            chunk['usage'] = None
+        yield chunk
 
         # generate reply #######################################
         debug_msg({'prompt': prompt, 'generate_params': generate_params})
@@ -617,6 +659,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
 
             seen_content = answer
             chunk = text_streaming_chunk(new_content)
+            if include_usage:
+                chunk['usage'] = None
             yield chunk
 
         completion_token_count = len(encode(answer)[0])
@@ -626,13 +670,28 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
 
         chunk = text_streaming_chunk(suffix)
         chunk[resp_list][0]["finish_reason"] = stop_reason
-        chunk["usage"] = {
+        usage = {
             "prompt_tokens": token_count,
             "completion_tokens": completion_token_count,
             "total_tokens": token_count + completion_token_count
         }
 
-        yield chunk
+        if include_usage:
+            chunk['usage'] = None
+            yield chunk
+            # Separate usage-only chunk with choices: [] per OpenAI spec
+            yield {
+                "id": cmpl_id,
+                "object": object_type,
+                "created": created_time,
+                "model": shared.model_name,
+                "system_fingerprint": None,
+                resp_list: [],
+                "usage": usage
+            }
+        else:
+            chunk["usage"] = usage
+            yield chunk
 
 
 def chat_completions(body: dict, is_legacy: bool = False, stop_event=None) -> dict:
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 2156074b..078bd201 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -99,6 +99,10 @@ class ToolCall(BaseModel):
     function: FunctionCall
 
 
+class StreamOptions(BaseModel):
+    include_usage: bool | None = False
+
+
 class CompletionRequestParams(BaseModel):
     model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
     prompt: str | List[str] | None = Field(default=None, description="Text prompt for completion. Can also use 'messages' format for multimodal.")
@@ -113,6 +117,7 @@ class CompletionRequestParams(BaseModel):
     presence_penalty: float | None = shared.args.presence_penalty
     stop: str | List[str] | None = None
     stream: bool | None = False
+    stream_options: StreamOptions | None = None
     suffix: str | None = None
     temperature: float | None = shared.args.temperature
     top_p: float | None = shared.args.top_p
@@ -151,6 +156,7 @@ class ChatCompletionRequestParams(BaseModel):
     presence_penalty: float | None = shared.args.presence_penalty
     stop: str | List[str] | None = None
     stream: bool | None = False
+    stream_options: StreamOptions | None = None
     temperature: float | None = shared.args.temperature
     top_p: float | None = shared.args.top_p
     user: str | None = Field(default=None, description="Unused parameter.")

From 7a63a560430641d55b685994c097ab7a588b6547 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 11 Mar 2026 12:53:19 -0700
Subject: [PATCH 028/210] Update llama.cpp

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 12 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index a3d4d1e6..057ff94e 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.24/exllamav3-0.0.24+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.24/exllamav3-0.0.24+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 3211f251..19c45361 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+rocm6.4-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+rocm6.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm6.4-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm6.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 8d452114..4f8ea026 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 525ceed5..9b5bf1ab 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 86b65a97..7e49bbff 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 61c9ef73..00c6d0d6 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 3d0785a3..2d8dad0a 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+rocm6.4-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+rocm6.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm6.4-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm6.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 6805e209..299f50c1 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 5a8ed87b..dbc9247c 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index fafa23cf..66a99ba6 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 3ef59f97..4338b20c 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 6039357d..7d262302 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.87.0/llama_cpp_binaries-0.87.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 24977846fbd34eca6bb40292381fe2820fe58343 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 11 Mar 2026 13:14:26 -0700
Subject: [PATCH 029/210] Update AMD ROCm from 6.4 to 7.2

---
 one_click.py                               | 10 ++++++----
 requirements/full/requirements_amd.txt     |  4 ++--
 requirements/portable/requirements_amd.txt |  4 ++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/one_click.py b/one_click.py
index 189d81b8..fdffd0a0 100644
--- a/one_click.py
+++ b/one_click.py
@@ -91,7 +91,7 @@ def get_gpu_choice():
                 "What is your GPU?",
                 {
                     'A': 'NVIDIA',
-                    'B': 'AMD - Linux/macOS only, requires ROCm 6.4',
+                    'B': 'AMD - Linux only, ROCm 7.2',
                     'C': 'Apple M Series',
                     'D': 'Intel Arc (beta)',
                     'N': 'CPU mode'
@@ -115,7 +115,8 @@ def get_pytorch_install_command(gpu_choice):
     if gpu_choice == "NVIDIA_CUDA128":
         return base_cmd + "--index-url https://download.pytorch.org/whl/cu128"
     elif gpu_choice == "AMD":
-        return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.4"
+        py_tag = f"cp{PYTHON_VERSION.replace('.', '')}"
+        return f"python -m pip install https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-{TORCH_VERSION}%2Brocm7.2.0.lw.git7e1940d4-{py_tag}-{py_tag}-linux_x86_64.whl"
     elif gpu_choice in ["APPLE", "NONE"]:
         return base_cmd + "--index-url https://download.pytorch.org/whl/cpu"
     elif gpu_choice == "INTEL":
@@ -131,7 +132,8 @@ def get_pytorch_update_command(gpu_choice):
     if gpu_choice == "NVIDIA_CUDA128":
         return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu128"
     elif gpu_choice == "AMD":
-        return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.4"
+        py_tag = f"cp{PYTHON_VERSION.replace('.', '')}"
+        return f"python -m pip install --upgrade https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-{TORCH_VERSION}%2Brocm7.2.0.lw.git7e1940d4-{py_tag}-{py_tag}-linux_x86_64.whl"
     elif gpu_choice in ["APPLE", "NONE"]:
         return f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
     elif gpu_choice == "INTEL":
@@ -266,7 +268,7 @@ def update_pytorch_and_python():
 
 
 def clean_outdated_pytorch_cuda_dependencies():
-    patterns = ["cu121", "cu122", "torch2.4", "torch2.6", "torch2.7", "torchvision", "torchaudio"]
+    patterns = ["cu121", "cu122", "rocm6", "torch2.4", "torch2.6", "torch2.7", "torchvision", "torchaudio"]
     result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
     matching_packages = []
 
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 19c45361..03e96789 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm6.4-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm6.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 2d8dad0a..8ccc15c9 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm6.4-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm6.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 66c976e995b554387aa5263cd5824b169c21f30a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 11 Mar 2026 18:09:40 -0300
Subject: [PATCH 030/210] Update README with ROCm 7.2 torch install URL

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8dc09f32..9a8e0a86 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,7 @@ conda activate textgen
 |--------|---------|---------|
 | Linux/WSL | NVIDIA | `pip3 install torch==2.9.1 --index-url https://download.pytorch.org/whl/cu128` |
 | Linux/WSL | CPU only | `pip3 install torch==2.9.1 --index-url https://download.pytorch.org/whl/cpu` |
-| Linux | AMD | `pip3 install torch==2.9.1 --index-url https://download.pytorch.org/whl/rocm6.4` |
+| Linux | AMD | `pip3 install https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-2.9.1%2Brocm7.2.0.lw.git7e1940d4-cp313-cp313-linux_x86_64.whl` |
 | MacOS + MPS | Any | `pip3 install torch==2.9.1` |
 | Windows | NVIDIA | `pip3 install torch==2.9.1 --index-url https://download.pytorch.org/whl/cu128` |
 | Windows | CPU only | `pip3 install torch==2.9.1` |

From bb00d96dc35f8b15f738905961c96b8c08900c87 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 11 Mar 2026 15:34:03 -0700
Subject: [PATCH 031/210] Use a new gr.DragDrop element for Sampler priority +
 update gradio

---
 js/main.js                                           | 10 +++++-----
 modules/ui_parameters.py                             |  2 +-
 requirements/full/requirements.txt                   |  4 ++--
 requirements/full/requirements_amd.txt               |  4 ++--
 requirements/full/requirements_apple_intel.txt       |  4 ++--
 requirements/full/requirements_apple_silicon.txt     |  4 ++--
 requirements/full/requirements_cpu_only.txt          |  4 ++--
 requirements/full/requirements_nowheels.txt          |  4 ++--
 requirements/portable/requirements.txt               |  4 ++--
 requirements/portable/requirements_amd.txt           |  4 ++--
 requirements/portable/requirements_apple_intel.txt   |  4 ++--
 requirements/portable/requirements_apple_silicon.txt |  4 ++--
 requirements/portable/requirements_cpu_only.txt      |  4 ++--
 requirements/portable/requirements_cuda131.txt       |  4 ++--
 requirements/portable/requirements_nowheels.txt      |  4 ++--
 requirements/portable/requirements_vulkan.txt        |  4 ++--
 16 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/js/main.js b/js/main.js
index 1317e9e7..a7dc77bb 100644
--- a/js/main.js
+++ b/js/main.js
@@ -296,11 +296,11 @@ function doSyntaxHighlighting() {
 //------------------------------------------------
 // Add some scrollbars
 //------------------------------------------------
-const textareaElements = document.querySelectorAll(".add_scrollbar textarea");
-for(i = 0; i < textareaElements.length; i++) {
-  textareaElements[i].classList.remove("scroll-hide");
-  textareaElements[i].classList.add("pretty_scrollbar");
-  textareaElements[i].style.resize = "none";
+const scrollbarElements = document.querySelectorAll(".add_scrollbar textarea, .add_scrollbar .drag-drop-list");
+for(i = 0; i < scrollbarElements.length; i++) {
+  scrollbarElements[i].classList.remove("scroll-hide");
+  scrollbarElements[i].classList.add("pretty_scrollbar");
+  scrollbarElements[i].style.resize = "none";
 }
 
 //------------------------------------------------
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index e5eb9210..a5afd7e5 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -73,7 +73,7 @@ def create_ui():
                             gr.Markdown('## Other options')
                             shared.gradio['do_sample'] = gr.Checkbox(value=shared.settings['do_sample'], label='do_sample')
                             shared.gradio['temperature_last'] = gr.Checkbox(value=shared.settings['temperature_last'], label='temperature_last', info='Moves temperature/dynamic temperature/quadratic sampling to the end of the sampler stack, ignoring their positions in "Sampler priority".')
-                            shared.gradio['sampler_priority'] = gr.Textbox(value=shared.settings['sampler_priority'], lines=10, label='Sampler priority', info='Parameter names separated by new lines or commas.', elem_classes=['add_scrollbar'])
+                            shared.gradio['sampler_priority'] = gr.DragDrop(value=shared.settings['sampler_priority'], label='Sampler priority', info='Parameter names separated by new lines or commas.', elem_classes=['add_scrollbar'])
                             shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=shared.settings['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.')
 
                 with gr.Column():
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 057ff94e..12e7fbae 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 03e96789..19cc0d9d 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 4f8ea026..ebe26f9d 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 9b5bf1ab..49155690 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 7e49bbff..1c7c5735 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 0a924d31..63823db8 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 00c6d0d6..db23d4bf 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ rich
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 8ccc15c9..e8cd9fd9 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ rich
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 299f50c1..24c558a9 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ rich
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index dbc9247c..f2e8e691 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ rich
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 66a99ba6..296c0432 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ rich
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 4338b20c..aefce769 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ rich
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index c2fc33eb..8c3e2aac 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ rich
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 7d262302..76bb5872 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ rich
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio-4.37.2+custom.9-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.9/gradio_client-1.0.2+custom.9-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 980a9d165780b6a2dcc6c4bf6469436861079d2a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 11 Mar 2026 15:50:16 -0700
Subject: [PATCH 032/210] UI: Minor defensive changes to autosave

---
 modules/chat.py |  8 ++++++--
 modules/ui.py   | 11 +++++++----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 62d1492d..10969446 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -6,6 +6,7 @@ import json
 import pprint
 import re
 import shutil
+import threading
 import time
 from datetime import datetime
 from functools import partial
@@ -40,6 +41,8 @@ from modules.utils import (
 )
 from modules.web_search import add_web_search_attachments
 
+_history_file_lock = threading.Lock()
+
 
 def strftime_now(format):
     return datetime.now().strftime(format)
@@ -1200,8 +1203,9 @@ def save_history(history, unique_id, character, mode):
     if not p.parent.is_dir():
         p.parent.mkdir(parents=True)
 
-    with open(p, 'w', encoding='utf-8') as f:
-        f.write(json.dumps(history, indent=4, ensure_ascii=False))
+    with _history_file_lock:
+        with open(p, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(history, indent=4, ensure_ascii=False))
 
 
 def rename_history(old_id, new_id, character, mode):
diff --git a/modules/ui.py b/modules/ui.py
index abbfde49..2ab30563 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -303,12 +303,16 @@ def save_settings(state, preset, extensions_list, show_controls, theme_state, ma
         if k in shared.settings and k not in exclude:
             output[k] = state[k]
 
-    output['preset'] = preset
+    if preset:
+        output['preset'] = preset
     output['prompt-notebook'] = state['prompt_menu-default'] if state['show_two_notebook_columns'] else state['prompt_menu-notebook']
-    output['character'] = state['character_menu']
-    if 'user_menu' in state and state['user_menu']:
+    if state.get('character_menu'):
+        output['character'] = state['character_menu']
+    if state.get('user_menu'):
         output['user'] = state['user_menu']
     output['seed'] = int(output['seed'])
+    output['custom_stopping_strings'] = output.get('custom_stopping_strings') or ''
+    output['custom_token_bans'] = output.get('custom_token_bans') or ''
     output['show_controls'] = show_controls
     output['dark_theme'] = True if theme_state == 'dark' else False
     output.pop('instruction_template_str')
@@ -470,7 +474,6 @@ def setup_auto_save():
         'skip_special_tokens',
         'stream',
         'static_cache',
-        'truncation_length',
         'seed',
         'sampler_priority',
         'custom_stopping_strings',

From cf9ad8eafe96a3065bf3c8edba58d332de6474fe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 01:15:49 -0300
Subject: [PATCH 033/210] Initial tool-calling support in the UI

---
 modules/chat.py     | 189 ++++++++++++++++++++++++++++++++++++++++----
 modules/shared.py   |   1 +
 modules/tool_use.py |  70 ++++++++++++++++
 modules/ui.py       |   2 +
 modules/ui_chat.py  |   5 ++
 5 files changed, 253 insertions(+), 14 deletions(-)
 create mode 100644 modules/tool_use.py

diff --git a/modules/chat.py b/modules/chat.py
index 10969446..b0be2bc2 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -298,6 +298,23 @@ def generate_chat_prompt(user_input, state, **kwargs):
             if entry_meta.get('tool_calls') and messages[insert_pos].get('role') == 'assistant':
                 messages[insert_pos]['tool_calls'] = _deserialize_tool_call_arguments(entry_meta['tool_calls'])
 
+        # Expand tool_sequence from metadata (inserted AFTER assistant so that
+        # the final order is: user → tool_calls → tool_results → final_answer)
+        meta_key = f"assistant_{row_idx}"
+        tool_seq = metadata.get(meta_key, {}).get('tool_sequence', [])
+        if tool_seq:
+            for item in reversed(tool_seq):
+                if 'tool_calls' in item:
+                    messages.insert(insert_pos, {
+                        "role": "assistant", "content": "",
+                        "tool_calls": _deserialize_tool_call_arguments(item['tool_calls'])
+                    })
+                elif item.get('role') == 'tool':
+                    messages.insert(insert_pos, {
+                        "role": "tool", "content": item['content'],
+                        "tool_call_id": item.get('tool_call_id', '')
+                    })
+
         if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
             # Check for user message attachments in metadata
             user_key = f"user_{row_idx}"
@@ -367,6 +384,22 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
             messages.append({"role": "user", "content": user_input})
 
+        # Expand tool_sequence for the current entry (excluded from the
+        # history loop during regenerate — needed so the model sees prior
+        # tool calls and results when re-generating the final answer).
+        current_tool_seq = metadata.get(f"assistant_{len(history)}", {}).get('tool_sequence', [])
+        for item in current_tool_seq:
+            if 'tool_calls' in item:
+                messages.append({
+                    "role": "assistant", "content": "",
+                    "tool_calls": _deserialize_tool_call_arguments(item['tool_calls'])
+                })
+            elif item.get('role') == 'tool':
+                messages.append({
+                    "role": "tool", "content": item['content'],
+                    "tool_call_id": item.get('tool_call_id', '')
+                })
+
     if impersonate and state['mode'] != 'chat-instruct':
         messages.append({"role": "user", "content": "fake user message replace me"})
 
@@ -886,7 +919,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             }
     else:
         text, visible_text = output['internal'][-1][0], output['visible'][-1][0]
-        if regenerate:
+        if regenerate and not state.get('_tool_turn'):
             row_idx = len(output['internal']) - 1
 
             # Store the old response as a version before regenerating
@@ -984,7 +1017,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')]
 
         # Keep version metadata in sync during streaming (for regeneration)
-        if regenerate:
+        if regenerate and not state.get('_tool_turn'):
             row_idx = len(output['internal']) - 1
             key = f"assistant_{row_idx}"
             current_idx = output['metadata'][key]['current_version_index']
@@ -1012,7 +1045,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
 
     # Final sync for version metadata (in case streaming was disabled)
-    if regenerate:
+    if regenerate and not state.get('_tool_turn'):
         row_idx = len(output['internal']) - 1
         key = f"assistant_{row_idx}"
         current_idx = output['metadata'][key]['current_version_index']
@@ -1066,12 +1099,24 @@ def character_is_loaded(state, raise_exception=False):
 
 def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
     '''
-    Same as above but returns HTML for the UI
+    Same as above but returns HTML for the UI.
+    When tools are selected, wraps generation in a loop that detects
+    tool calls, executes them, and re-generates until the model stops.
+    All tool output is consolidated into a single visible chat bubble
+    using metadata['assistant_N']['tool_sequence'].
     '''
 
     if not character_is_loaded(state):
         return
 
+    # On regenerate, clear old tool_sequence metadata so it gets rebuilt
+    if regenerate:
+        history = state['history']
+        meta = history.get('metadata', {})
+        row_idx = len(history['internal']) - 1
+        if row_idx >= 0:
+            meta.get(f'assistant_{row_idx}', {}).pop('tool_sequence', None)
+
     if state['start_with'] != '' and not _continue:
         if regenerate:
             text, state['history'] = remove_last_message(state['history'])
@@ -1081,23 +1126,139 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         send_dummy_message(text, state)
         send_dummy_reply(state['start_with'], state)
 
-    history = state['history']
+    # Load tools if any are selected
+    selected = state.get('selected_tools', [])
+    if selected:
+        from modules.tool_use import load_tools, execute_tool, generate_tool_call_id
+        try:
+            from extensions.openai.utils import parseToolCall
+        except ImportError:
+            logger.warning('Tool calling requires the openai extension for parseToolCall. Disabling tools.')
+            selected = []
+
+    if selected:
+        tool_defs, tool_executors = load_tools(selected)
+        state['tools'] = tool_defs
+        tool_func_names = [t['function']['name'] for t in tool_defs]
+    else:
+        tool_func_names = None
+
+    visible_prefix = []  # Accumulated tool call summaries + results
     last_save_time = time.monotonic()
     save_interval = 8
-    for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
-        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'], last_message_only=(i > 0)), history
-        if i == 0:
-            time.sleep(0.125)  # We need this to make sure the first update goes through
+    max_tool_turns = 10
 
-        current_time = time.monotonic()
-        # Save on first iteration or if save_interval seconds have passed
-        if i == 0 or (current_time - last_save_time) >= save_interval:
-            save_history(history, state['unique_id'], state['character_menu'], state['mode'])
-            last_save_time = current_time
+    for _tool_turn in range(max_tool_turns):
+        history = state['history']
+
+        # Turn 0: use original flags; turns 2+: regenerate into the same entry
+        if _tool_turn > 0:
+            state['_tool_turn'] = True
+
+        regen = regenerate if _tool_turn == 0 else True
+        cont = _continue if _tool_turn == 0 else False
+        cur_text = text if _tool_turn == 0 else ''
+
+        for i, history in enumerate(generate_chat_reply(cur_text, state, regen, cont, loading_message=True, for_ui=True)):
+            # Prepend accumulated tool output to visible reply
+            if visible_prefix:
+                history['visible'][-1][1] = '\n\n'.join(visible_prefix + [history['visible'][-1][1]])
+
+            yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'], last_message_only=(i > 0)), history
+
+            if i == 0:
+                time.sleep(0.125)
+
+            current_time = time.monotonic()
+            if i == 0 or (current_time - last_save_time) >= save_interval:
+                save_history(history, state['unique_id'], state['character_menu'], state['mode'])
+                last_save_time = current_time
+
+            # Early stop on tool call detection
+            if tool_func_names and parseToolCall(history['internal'][-1][1], tool_func_names):
+                break
+
+        save_history(history, state['unique_id'], state['character_menu'], state['mode'])
+
+        # Check for tool calls
+        if not tool_func_names or shared.stop_everything:
+            break
+
+        answer = history['internal'][-1][1]
+        parsed_calls = parseToolCall(answer, tool_func_names) if answer else None
+
+        if not parsed_calls:
+            break  # No tool calls — done
+
+        # --- Process tool calls ---
+        row_idx = len(history['internal']) - 1
+        meta = history.get('metadata', {})
+        seq = meta.setdefault(f'assistant_{row_idx}', {}).setdefault('tool_sequence', [])
+
+        # Serialize tool calls
+        serialized = []
+        for tc in parsed_calls:
+            tc['id'] = generate_tool_call_id()
+            args = tc['function'].get('arguments', {})
+            serialized.append({
+                'id': tc['id'],
+                'type': 'function',
+                'function': {
+                    'name': tc['function']['name'],
+                    'arguments': json.dumps(args) if isinstance(args, dict) else args
+                }
+            })
+
+        seq.append({'tool_calls': serialized})
+
+        # Clear internal (raw tool markup)
+        history['internal'][-1][1] = ''
+
+        # Add call summary to visible prefix
+        call_summary = ', '.join(f'{tc["function"]["name"]}(...)' for tc in parsed_calls)
+        visible_prefix.append('Calling: ' + call_summary)
+
+        # Execute tools, store results
+        for tc in parsed_calls:
+            fn_name = tc['function']['name']
+            fn_args = tc['function'].get('arguments', {})
+            result = execute_tool(fn_name, fn_args, tool_executors)
+
+            seq.append({'role': 'tool', 'content': result, 'tool_call_id': tc['id']})
+            try:
+                pretty_result = json.dumps(json.loads(result), indent=2, ensure_ascii=False)
+            except (json.JSONDecodeError, TypeError):
+                pretty_result = result
+
+            visible_prefix.append(f'**{fn_name}**\n```json\n{pretty_result}\n```')
+
+        # Show tool results
+        history['visible'][-1][1] = '\n\n'.join(visible_prefix)
+        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
+        save_history(history, state['unique_id'], state['character_menu'], state['mode'])
+
+        state['history'] = history
+
+    state.pop('_tool_turn', None)
+    state['history'] = history
+
+    # Sync version metadata so swipes show the full visible (with tool prefix)
+    if visible_prefix and history.get('metadata'):
+        row_idx = len(history['internal']) - 1
+        key = f"assistant_{row_idx}"
+        meta_entry = history['metadata'].get(key, {})
+        if 'versions' in meta_entry and 'current_version_index' in meta_entry:
+            current_idx = meta_entry['current_version_index']
+            if current_idx < len(meta_entry['versions']):
+                meta_entry['versions'][current_idx].update({
+                    'content': history['internal'][row_idx][1],
+                    'visible_content': history['visible'][row_idx][1]
+                })
 
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
 
 
+
 def remove_last_message(history):
     if 'metadata' not in history:
         history['metadata'] = {}
diff --git a/modules/shared.py b/modules/shared.py
index dbd805a1..395ca83c 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -263,6 +263,7 @@ settings = {
     'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>". Reply directly, without starting the reply with the character name.\n\n<|prompt|>',
     'enable_web_search': False,
     'web_search_pages': 3,
+    'selected_tools': [],
     'prompt-notebook': '',
     'preset': 'Qwen3 - Thinking' if (user_data_dir / 'presets/Qwen3 - Thinking.yaml').exists() else None,
     'max_new_tokens': 512,
diff --git a/modules/tool_use.py b/modules/tool_use.py
new file mode 100644
index 00000000..cb1e140d
--- /dev/null
+++ b/modules/tool_use.py
@@ -0,0 +1,70 @@
+import importlib.util
+import json
+import random
+
+from modules import shared
+from modules.logging_colors import logger
+
+
+def get_available_tools():
+    """Return the sorted stems of the *.py tool scripts in <user_data_dir>/tools, creating that directory if needed."""
+    tools_dir = shared.user_data_dir / 'tools'
+    tools_dir.mkdir(parents=True, exist_ok=True)
+    return sorted(p.stem for p in tools_dir.glob('*.py'))
+
+
+def load_tools(selected_names):
+    """
+    Import the selected tool scripts from <user_data_dir>/tools.
+    Scripts that don't exist are skipped silently; import failures and scripts
+    lacking a `tool` dict or an `execute` function are logged and skipped.
+    Returns (tool_defs, executors): OpenAI-format `tool` dicts, and a dict of function name (else script name) -> `execute`.
+    """
+    tool_defs = []
+    executors = {}
+    for name in selected_names:
+        path = shared.user_data_dir / 'tools' / f'{name}.py'
+        if not path.exists():
+            continue
+
+        try:
+            spec = importlib.util.spec_from_file_location(f"tool_{name}", str(path))
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+        except Exception:
+            logger.exception(f'Failed to load tool script "{name}"')
+            continue
+
+        tool_def = getattr(module, 'tool', None)
+        execute_fn = getattr(module, 'execute', None)
+        if tool_def is None or execute_fn is None:
+            logger.warning(f'Tool "{name}" is missing a "tool" dict or "execute" function.')
+            continue
+
+        func_name = tool_def.get('function', {}).get('name', name)
+        tool_defs.append(tool_def)
+        executors[func_name] = execute_fn
+
+    return tool_defs, executors
+
+
+def generate_tool_call_id():
+    """Generate a random tool call ID (e.g. 'call_a1b2c3d4'); collisions are unlikely but not checked for."""
+    chars = "abcdefghijklmnopqrstuvwxyz0123456789"
+    return "call_" + "".join(random.choice(chars) for _ in range(8))
+
+
+def execute_tool(func_name, arguments, executors):
+    """Execute a tool by function name. Returns a string: a str result as-is, any other result JSON-encoded, or a JSON {"error": ...} object for an unknown tool or a failure."""
+    fn = executors.get(func_name)
+    if fn is None:
+        return json.dumps({"error": f"Unknown tool: {func_name}"})
+
+    try:
+        if isinstance(arguments, str):
+            arguments = json.loads(arguments)
+        result = fn(arguments)
+        return json.dumps(result) if not isinstance(result, str) else result
+    except Exception as e:
+        logger.exception(f'Tool "{func_name}" execution failed')
+        return json.dumps({"error": str(e)})
diff --git a/modules/ui.py b/modules/ui.py
index 2ab30563..3f39a1a4 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -199,6 +199,7 @@ def list_interface_input_elements():
         'unique_id',
         'textbox',
         'start_with',
+        'selected_tools',
         'mode',
         'chat_style',
         'chat-instruct_command',
@@ -424,6 +425,7 @@ def setup_auto_save():
         'user_bio',
         'custom_system_message',
         'chat_template_str',
+        'selected_tools',
 
         # Parameters tab (ui_parameters.py) - Generation parameters
         'preset_menu',
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 74da0a40..9c7424e7 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -91,6 +91,11 @@ def create_ui():
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
+                from modules.tool_use import get_available_tools
+                shared.gradio['selected_tools'] = gr.CheckboxGroup(choices=get_available_tools(), value=[], label='Tools', info='Functions the model can call during generation.')
+
+                gr.HTML("<div class='sidebar-vertical-separator'></div>")
+
                 with gr.Row():
                     shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='In instruct and chat-instruct modes, the template under Parameters > Instruction template is used.', elem_id='chat-mode')
 

From 0d6203871050f5db14b5197899f7e11fbb152fd2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 01:36:07 -0300
Subject: [PATCH 034/210] Add tools refresh button and _tool_turn comment

---
 modules/chat.py    | 4 +++-
 modules/ui_chat.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index b0be2bc2..ad1095f3 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1151,7 +1151,9 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
     for _tool_turn in range(max_tool_turns):
         history = state['history']
 
-        # Turn 0: use original flags; turns 2+: regenerate into the same entry
+        # Turn 0: use original flags; turns 1+: regenerate into the same entry.
+        # _tool_turn tells chatbot_wrapper to skip version creation/sync so
+        # that intermediate tool-loop regenerations don't pollute swipe history.
         if _tool_turn > 0:
             state['_tool_turn'] = True
 
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 9c7424e7..d5b13094 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -93,6 +93,7 @@ def create_ui():
 
                 from modules.tool_use import get_available_tools
                 shared.gradio['selected_tools'] = gr.CheckboxGroup(choices=get_available_tools(), value=[], label='Tools', info='Functions the model can call during generation.')
+                ui.create_refresh_button(shared.gradio['selected_tools'], lambda: None, lambda: {'choices': get_available_tools()}, 'refresh-button')
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 

From b5cac2e3b2bc20d2343c2c7dbecdcb01197d422a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 01:48:19 -0300
Subject: [PATCH 035/210] Fix swipes and edit for tool calling in the UI

---
 modules/chat.py | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index ad1095f3..f3020ee1 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1109,13 +1109,15 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
     if not character_is_loaded(state):
         return
 
-    # On regenerate, clear old tool_sequence metadata so it gets rebuilt
+    # On regenerate, clear old tool_sequence metadata so it gets rebuilt.
+    # Save it first so it can be stored per-version below.
+    _old_tool_sequence = None
     if regenerate:
         history = state['history']
         meta = history.get('metadata', {})
         row_idx = len(history['internal']) - 1
         if row_idx >= 0:
-            meta.get(f'assistant_{row_idx}', {}).pop('tool_sequence', None)
+            _old_tool_sequence = meta.get(f'assistant_{row_idx}', {}).pop('tool_sequence', None)
 
     if state['start_with'] != '' and not _continue:
         if regenerate:
@@ -1128,6 +1130,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
 
     # Load tools if any are selected
     selected = state.get('selected_tools', [])
+    parseToolCall = None
     if selected:
         from modules.tool_use import load_tools, execute_tool, generate_tool_call_id
         try:
@@ -1169,6 +1172,16 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
             yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'], last_message_only=(i > 0)), history
 
             if i == 0:
+                # Save old tool_sequence into version 0 (created by chatbot_wrapper
+                # on the first yield).  Only needed on the first regeneration when
+                # versions didn't previously exist.
+                if _old_tool_sequence is not None and _tool_turn == 0:
+                    _ri = len(history['internal']) - 1
+                    _versions = history.get('metadata', {}).get(f'assistant_{_ri}', {}).get('versions', [])
+                    if _versions and 'tool_sequence' not in _versions[0]:
+                        _versions[0]['tool_sequence'] = _old_tool_sequence
+                    _old_tool_sequence = None
+
                 time.sleep(0.125)
 
             current_time = time.monotonic()
@@ -1252,15 +1265,18 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         if 'versions' in meta_entry and 'current_version_index' in meta_entry:
             current_idx = meta_entry['current_version_index']
             if current_idx < len(meta_entry['versions']):
-                meta_entry['versions'][current_idx].update({
+                version_update = {
                     'content': history['internal'][row_idx][1],
                     'visible_content': history['visible'][row_idx][1]
-                })
+                }
+                ts = meta_entry.get('tool_sequence')
+                if ts is not None:
+                    version_update['tool_sequence'] = ts
+                meta_entry['versions'][current_idx].update(version_update)
 
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
 
 
-
 def remove_last_message(history):
     if 'metadata' not in history:
         history['metadata'] = {}
@@ -2163,11 +2179,15 @@ def handle_edit_message_click(state):
         original_visible = history['visible'][message_index][role_idx]
         original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp())
 
-        history['metadata'][key]["versions"] = [{
+        version_entry = {
             "content": original_content,
             "visible_content": original_visible,
             "timestamp": original_timestamp
-        }]
+        }
+        ts = history['metadata'][key].get('tool_sequence')
+        if ts is not None:
+            version_entry['tool_sequence'] = ts
+        history['metadata'][key]["versions"] = [version_entry]
 
     history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True)
     history['visible'][message_index][role_idx] = html.escape(new_text)
@@ -2215,6 +2235,14 @@ def handle_navigate_version_click(state):
     history['internal'][message_index][msg_content_idx] = version_to_load['content']
     history['visible'][message_index][msg_content_idx] = version_to_load['visible_content']
     metadata['current_version_index'] = new_idx
+
+    # Restore per-version tool_sequence so follow-up prompts see consistent context
+    version_ts = version_to_load.get('tool_sequence')
+    if version_ts is not None:
+        metadata['tool_sequence'] = version_ts
+    else:
+        metadata.pop('tool_sequence', None)
+
     update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp'])
 
     # Redraw and save

From 2549f7c33b8989c9604edc6476fa2354f9bb6662 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 10:28:49 -0300
Subject: [PATCH 036/210] API: Add tool_choice support and fix tool_calls spec
 compliance

---
 extensions/openai/completions.py | 24 ++++++++++++++++++------
 extensions/openai/typing.py      |  1 +
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 03c4b03e..8d3cce57 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -218,6 +218,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0:
         tools = validateTools(body['tools'])  # raises InvalidRequestError if validation fails
 
+    tool_choice = body.get('tool_choice', None)
+    if tool_choice == "none":
+        tools = None  # Disable tool detection entirely
+
     messages = body['messages']
     for m in messages:
         if 'role' not in m:
@@ -367,6 +371,12 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     end_last_tool_call = 0
     supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None
 
+    # Filter supported_tools when tool_choice specifies a particular function
+    if supported_tools and isinstance(tool_choice, dict):
+        specified_func = tool_choice.get("function", {}).get("name")
+        if specified_func and specified_func in supported_tools:
+            supported_tools = [specified_func]
+
     for a in generator:
         answer = a['internal'][-1][1]
 
@@ -375,11 +385,17 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             if len(tool_call) > 0:
                 for tc in tool_call:
                     tc["id"] = getToolCallId()
-                    tc["index"] = len(tool_calls)
+                    if stream:
+                        tc["index"] = len(tool_calls)
                     tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
                     tool_calls.append(tc)
                 end_last_tool_call = len(answer)
 
+        # Stop generation before streaming content if tool_calls were detected,
+        # so that raw tool markup is not sent as content deltas.
+        if len(tool_calls) > 0:
+            break
+
         if stream:
             len_seen = len(seen_content)
             new_content = answer[len_seen:]
@@ -394,10 +410,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             seen_content = answer
             yield chunk
 
-        # stop generation if tool_calls were generated previously
-        if len(tool_calls) > 0:
-            break
-
     token_count = shared.model.last_prompt_token_count if hasattr(shared.model, 'last_prompt_token_count') else 0
     completion_token_count = len(encode(answer)[0])
     stop_reason = "stop"
@@ -441,7 +453,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             resp_list: [{
                 "index": 0,
                 "finish_reason": stop_reason,
-                "message": {"role": "assistant", "refusal": None, "content": answer, **({"tool_calls": tool_calls} if tool_calls else {})},
+                "message": {"role": "assistant", "refusal": None, "content": None if tool_calls else answer, **({"tool_calls": tool_calls} if tool_calls else {})},
                 "logprobs": None,
             }],
             "usage": {
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 078bd201..4d0f4a4a 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -150,6 +150,7 @@ class ChatCompletionRequestParams(BaseModel):
     function_call: str | dict | None = Field(default=None, description="Unused parameter.")
     functions: List[dict] | None = Field(default=None, description="Unused parameter.")
     tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.")
+    tool_choice: str | dict | None = Field(default=None, description="Controls tool use: 'auto', 'none', 'required', or {\"type\": \"function\", \"function\": {\"name\": \"...\"}}.")
     logit_bias: dict | None = None
     max_tokens: int | None = None
     n: int | None = Field(default=1, description="Unused parameter.")

From 09723c9988cf8144286c88832cf2627e2df4938d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 12:43:03 -0300
Subject: [PATCH 037/210] API: Include /v1 in the printed API URL for easier
 integration

---
 extensions/openai/script.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extensions/openai/script.py b/extensions/openai/script.py
index 94c7650f..e3726bc8 100644
--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@@ -453,11 +453,11 @@ def run_server():
             port,
             shared.args.public_api_id,
             max_attempts=3,
-            on_start=lambda url: logger.info(f'OpenAI-compatible API URL:\n\n{url}\n')
+            on_start=lambda url: logger.info(f'OpenAI-compatible API URL:\n\n{url}/v1\n')
         )
     else:
         url_proto = 'https://' if (ssl_certfile and ssl_keyfile) else 'http://'
-        urls = [f'{url_proto}{addr}:{port}' for addr in server_addrs]
+        urls = [f'{url_proto}{addr}:{port}/v1' for addr in server_addrs]
         if len(urls) > 1:
             logger.info('OpenAI-compatible API URLs:\n\n' + '\n'.join(urls) + '\n')
         else:

From 4b6c9db1c9137a81f363e3dc7c0665441be83735 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 13:11:28 -0300
Subject: [PATCH 038/210] UI: Fix stale tool_sequence after edit and
 chat-instruct tool rendering

---
 modules/chat.py   | 1 +
 modules/shared.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index f3020ee1..5cf550d6 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -2191,6 +2191,7 @@ def handle_edit_message_click(state):
 
     history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True)
     history['visible'][message_index][role_idx] = html.escape(new_text)
+    history['metadata'][key].pop('tool_sequence', None)
 
     add_message_version(history, role, message_index, is_current=True)
 
diff --git a/modules/shared.py b/modules/shared.py
index 395ca83c..8c0aad9a 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -348,7 +348,7 @@ settings = {
     'greeting': 'How can I help you today?',
     'custom_system_message': '',
     'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n    {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' -%}\n        {{- '' + message['content'] + '\\n\\n' -}}\n    {%- else -%}\n        {%- if message['role'] == 'user' -%}\n            {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n        {%- else -%}\n            {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n        {%- endif -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n    {{-'### Response:\\n'-}}\n{%- endif -%}",
-    'chat_template_str': "{%- for message in messages %}\n    {%- if message['role'] == 'system' -%}\n        {%- if message['content'] -%}\n            {{- message['content'] + '\\n\\n' -}}\n        {%- endif -%}\n        {%- if user_bio -%}\n            {{- user_bio + '\\n\\n' -}}\n        {%- endif -%}\n    {%- else -%}\n        {%- if message['role'] == 'user' -%}\n            {{- name1 + ': ' + message['content'] + '\\n'-}}\n        {%- else -%}\n            {{- name2 + ': ' + message['content'] + '\\n' -}}\n        {%- endif -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n    {{- name2 + ':' -}}\n{%- endif %}",
+    'chat_template_str': "{%- for message in messages %}\n    {%- if message['role'] == 'system' -%}\n        {%- if message['content'] -%}\n            {{- message['content'] + '\\n\\n' -}}\n        {%- endif -%}\n        {%- if user_bio -%}\n            {{- user_bio + '\\n\\n' -}}\n        {%- endif -%}\n    {%- elif message['role'] == 'tool' -%}\n        {{- '[Tool result: ' + message['content'] + ']\\n' -}}\n    {%- elif message['role'] == 'user' -%}\n        {{- name1 + ': ' + message['content'] + '\\n'-}}\n    {%- elif message['tool_calls'] is defined and message['tool_calls'] -%}\n        {%- for tc in message['tool_calls'] -%}\n            {{- '[Calling: ' + tc['function']['name'] + '(' + tc['function']['arguments'] + ')]\\n' -}}\n        {%- endfor -%}\n    {%- else -%}\n        {{- name2 + ': ' + message['content'] + '\\n' -}}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n    {{- name2 + ':' -}}\n{%- endif %}",
 
     # Extensions
     'default_extensions': [],

From 5a017aa3380ae273e0ce32fe02709f93cb710d20 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 13:23:01 -0300
Subject: [PATCH 039/210] API: Several OpenAI spec compliance fixes

- Return proper OpenAI error format ({"error": {...}}) instead of HTTP 500 for validation errors
- Send data: [DONE] at the end of SSE streams
- Fix finish_reason so "tool_calls" takes priority over "length"
- Stop including usage in streaming chunks when include_usage is not set
- Handle "developer" role in messages (treated same as "system")
- Add logprobs and top_logprobs parameters for chat completions
- Fix chat completions logprobs not working with llama.cpp and ExLlamav3 backends
- Add max_completion_tokens as an alias for max_tokens in chat completions
---
 extensions/openai/completions.py | 25 ++++++++++++++++++-------
 extensions/openai/script.py      | 19 +++++++++++++++++++
 extensions/openai/typing.py      |  9 +++++++++
 3 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 8d3cce57..a8b899d5 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -105,6 +105,10 @@ def process_parameters(body, is_legacy=False):
             logits_processor = [LogitsBiasProcessor(logit_bias)]
 
         logprobs = body.get('logprobs', None)
+        top_logprobs = body.get('top_logprobs', None)
+        # For chat completions, logprobs is a bool; use top_logprobs for the count
+        if logprobs is True:
+            logprobs = top_logprobs if top_logprobs and top_logprobs > 0 else 5
         if logprobs is not None and logprobs > 0:
             generate_params['logprob_proc'] = LogprobProcessor(logprobs)
             logits_processor.extend([generate_params['logprob_proc']])
@@ -191,7 +195,7 @@ def convert_history(history):
             if "tool_call_id" in entry:
                 meta["tool_call_id"] = entry["tool_call_id"]
             chat_dialogue.append(['', '', content, meta])
-        elif role == "system":
+        elif role in ("system", "developer"):
             system_message += f"\n{content}" if system_message else content
 
     if not user_input_last:
@@ -339,9 +343,13 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             }],
         }
 
-        if logprob_proc:  # not official for chat yet
+        if logprob_proc:
             top_logprobs = convert_logprobs_to_tiktoken(model=requested_model, logprobs=logprob_proc.token_alternatives)
             chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
+        elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
+            backend_logprobs = get_logprobs_from_backend()
+            if backend_logprobs:
+                chunk[resp_list][0]["logprobs"] = {'top_logprobs': [backend_logprobs]}
 
         return chunk
 
@@ -412,11 +420,12 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
 
     token_count = shared.model.last_prompt_token_count if hasattr(shared.model, 'last_prompt_token_count') else 0
     completion_token_count = len(encode(answer)[0])
-    stop_reason = "stop"
     if len(tool_calls) > 0:
         stop_reason = "tool_calls"
-    if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
+    elif token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
         stop_reason = "length"
+    else:
+        stop_reason = "stop"
 
     if stream:
         chunk = chat_streaming_chunk(chunk_tool_calls=tool_calls)
@@ -441,7 +450,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
                 "usage": usage
             }
         else:
-            chunk['usage'] = usage
             yield chunk
     else:
         resp = {
@@ -462,9 +470,13 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
                 "total_tokens": token_count + completion_token_count
             }
         }
-        if logprob_proc:  # not official for chat yet
+        if logprob_proc:
             top_logprobs = convert_logprobs_to_tiktoken(model=requested_model, logprobs=logprob_proc.token_alternatives)
             resp[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
+        elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
+            backend_logprobs = get_logprobs_from_backend()
+            if backend_logprobs:
+                resp[resp_list][0]["logprobs"] = {'top_logprobs': [backend_logprobs]}
 
         yield resp
 
@@ -702,7 +714,6 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                 "usage": usage
             }
         else:
-            chunk["usage"] = usage
             yield chunk
 
 
diff --git a/extensions/openai/script.py b/extensions/openai/script.py
index e3726bc8..f161e1e4 100644
--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@@ -21,6 +21,7 @@ import extensions.openai.completions as OAIcompletions
 import extensions.openai.logits as OAIlogits
 import extensions.openai.models as OAImodels
 from extensions.openai.tokens import token_count, token_decode, token_encode
+from extensions.openai.errors import OpenAIError
 from extensions.openai.utils import _start_cloudflared
 from modules import shared
 from modules.logging_colors import logger
@@ -94,6 +95,20 @@ app.add_middleware(
 )
 
 
+@app.exception_handler(OpenAIError)
+async def openai_error_handler(request: Request, exc: OpenAIError):
+    error_type = "server_error" if exc.code >= 500 else "invalid_request_error"
+    return JSONResponse(
+        status_code=exc.code,
+        content={"error": {
+            "message": exc.message,
+            "type": error_type,
+            "param": getattr(exc, 'param', None),
+            "code": None
+        }}
+    )
+
+
 @app.middleware("http")
 async def validate_host_header(request: Request, call_next):
     # Be strict about only approving access to localhost by default
@@ -136,6 +151,8 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
                         break
 
                     yield {"data": json.dumps(resp)}
+
+                yield {"data": "[DONE]"}
             finally:
                 stop_event.set()
                 response.close()
@@ -176,6 +193,8 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
                         break
 
                     yield {"data": json.dumps(resp)}
+
+                yield {"data": "[DONE]"}
             finally:
                 stop_event.set()
                 response.close()
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 4d0f4a4a..80831c44 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -152,7 +152,10 @@ class ChatCompletionRequestParams(BaseModel):
     tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.")
     tool_choice: str | dict | None = Field(default=None, description="Controls tool use: 'auto', 'none', 'required', or {\"type\": \"function\", \"function\": {\"name\": \"...\"}}.")
     logit_bias: dict | None = None
+    logprobs: bool | None = None
+    top_logprobs: int | None = None
     max_tokens: int | None = None
+    max_completion_tokens: int | None = None
     n: int | None = Field(default=1, description="Unused parameter.")
     presence_penalty: float | None = shared.args.presence_penalty
     stop: str | List[str] | None = None
@@ -162,6 +165,12 @@ class ChatCompletionRequestParams(BaseModel):
     top_p: float | None = shared.args.top_p
     user: str | None = Field(default=None, description="Unused parameter.")
 
+    @model_validator(mode='after')
+    def resolve_max_tokens(self):
+        if self.max_tokens is None and self.max_completion_tokens is not None:
+            self.max_tokens = self.max_completion_tokens
+        return self
+
     mode: str = Field(default='instruct', description="Valid options: instruct, chat, chat-instruct.")
 
     instruction_template: str | None = Field(default=None, description="An instruction template defined under text-generation-webui/user_data/instruction-templates. If not set, the correct template will be automatically obtained from the model metadata.")

From fb1b3b6ddf861dd3da46bde439e8bb8df8973230 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:16:34 -0300
Subject: [PATCH 040/210] API: Rewrite logprobs for OpenAI spec compliance
 across all backends

- Rewrite logprobs output format to match the OpenAI specification for
  both chat completions and completions endpoints
- Fix top_logprobs count being ignored for llama.cpp and ExLlamav3
  backends in chat completions (always returned 1 instead of requested N)
- Fix non-streaming responses only returning logprobs for the last token
  instead of all generated tokens (affects all HF-based loaders)
- Fix logprobs returning null for non-streaming chat requests on HF loaders
- Fix off-by-one returning one extra top alternative on HF loaders
---
 extensions/openai/completions.py | 185 ++++++++++++++++++++++++-------
 modules/text_generation.py       |   3 +
 modules/transformers_loader.py   |   4 +-
 3 files changed, 149 insertions(+), 43 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index a8b899d5..10cdbf42 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -37,35 +37,114 @@ def load_chat_template_file(filepath):
     return text
 
 
-def get_logprobs_from_backend():
-    """Read logprobs captured from llama.cpp or ExLlamav3 native backend."""
+def _get_raw_logprob_entries(offset=0):
+    """Get raw logprob entries from llama.cpp/ExLlamav3 backend, starting from offset.
+
+    Returns (new_entries, new_offset).
+    """
     if not hasattr(shared.model, 'last_completion_probabilities') or not shared.model.last_completion_probabilities:
+        return [], offset
+
+    all_entries = shared.model.last_completion_probabilities
+    new_entries = all_entries[offset:]
+    return new_entries, len(all_entries)
+
+
+def _dict_to_logprob_entries(token_dict):
+    """Convert a flat {token: logprob} dict (from LogprobProcessor) to raw entry format."""
+    if not token_dict:
+        return []
+
+    return [{"top_logprobs": [{"token": t, "logprob": lp} for t, lp in token_dict.items()]}]
+
+
+def _parse_entry_top(entry):
+    """Extract the top logprobs list from a raw entry, handling both key names."""
+    return entry.get('top_logprobs', entry.get('top_probs', []))
+
+
+def format_chat_logprobs(entries):
+    """Format logprob entries into OpenAI chat completions logprobs format.
+
+    Output: {"content": [{"token", "logprob", "bytes", "top_logprobs": [...]}]}
+    """
+    if not entries:
         return None
 
-    # Both backends store data in shared.model.last_completion_probabilities
-    # Format: [{"top_logprobs": [{"token": "text", "logprob": -0.5}, ...]}, ...]
-    result = {}
-    for entry in shared.model.last_completion_probabilities:
-        top = entry.get('top_logprobs', entry.get('top_probs', []))
+    content = []
+    for entry in entries:
+        top = _parse_entry_top(entry)
+        if not top:
+            continue
+
+        chosen = top[0]
+        token_str = chosen.get('token', '')
+        token_logprob = chosen.get('logprob', chosen.get('prob', 0))
+
+        top_list = []
         for item in top:
-            token = item.get('token', '')
-            logprob = item.get('logprob', item.get('prob', 0))
-            result[token] = logprob
+            t = item.get('token', '')
+            lp = item.get('logprob', item.get('prob', 0))
+            top_list.append({
+                "token": t,
+                "logprob": lp,
+                "bytes": list(t.encode('utf-8')) if t else None
+            })
 
-    return result
+        content.append({
+            "token": token_str,
+            "logprob": token_logprob,
+            "bytes": list(token_str.encode('utf-8')) if token_str else None,
+            "top_logprobs": top_list
+        })
+
+    return {"content": content} if content else None
 
 
-def convert_logprobs_to_tiktoken(model, logprobs):
-    # more problems than it's worth.
-    # try:
-    #     encoder = tiktoken.encoding_for_model(model)
-    #     # just pick the first one if it encodes to multiple tokens... 99.9% not required and maybe worse overall.
-    #     return dict([(encoder.decode([encoder.encode(token)[0]]), prob) for token, prob in logprobs.items()])
-    # except KeyError:
-    #     # assume native tokens if we can't find the tokenizer
-    #     return logprobs
+def format_completion_logprobs(entries):
+    """Format logprob entries into OpenAI completions logprobs format.
 
-    return logprobs
+    Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "text_offset"}
+    """
+    if not entries:
+        return None
+
+    tokens = []
+    token_logprobs = []
+    top_logprobs = []
+    text_offset = []
+    offset = 0
+
+    for entry in entries:
+        top = _parse_entry_top(entry)
+        if not top:
+            continue
+
+        chosen = top[0]
+        token_str = chosen.get('token', '')
+        token_logprob = chosen.get('logprob', chosen.get('prob', 0))
+
+        tokens.append(token_str)
+        token_logprobs.append(token_logprob)
+        text_offset.append(offset)
+        offset += len(token_str)
+
+        top_dict = {}
+        for item in top:
+            t = item.get('token', '')
+            lp = item.get('logprob', item.get('prob', 0))
+            top_dict[t] = lp
+        top_logprobs.append(top_dict)
+
+    if not tokens:
+        return None
+
+    return {
+        "tokens": tokens,
+        "token_logprobs": token_logprobs,
+        "top_logprobs": top_logprobs,
+        "text_offset": text_offset
+    }
 
 
 def process_parameters(body, is_legacy=False):
@@ -90,6 +169,14 @@ def process_parameters(body, is_legacy=False):
         elif isinstance(body['stop'], list):
             generate_params['custom_stopping_strings'] = body['stop']
 
+    # Resolve logprobs: for chat completions, logprobs is a bool and the count
+    # comes from top_logprobs. Normalize to an int for all backends.
+    logprobs = body.get('logprobs', None)
+    top_logprobs = body.get('top_logprobs', None)
+    if logprobs is True:
+        logprobs = top_logprobs if top_logprobs and top_logprobs > 0 else 5
+        generate_params['logprobs'] = logprobs
+
     # For llama.cpp and ExLlamav3 native, logit_bias and logprobs are forwarded natively
     if shared.args.loader not in ('llama.cpp', 'ExLlamav3'):
         from transformers import LogitsProcessorList
@@ -104,11 +191,6 @@ def process_parameters(body, is_legacy=False):
         if logit_bias:  # {str: float, ...}
             logits_processor = [LogitsBiasProcessor(logit_bias)]
 
-        logprobs = body.get('logprobs', None)
-        top_logprobs = body.get('top_logprobs', None)
-        # For chat completions, logprobs is a bool; use top_logprobs for the count
-        if logprobs is True:
-            logprobs = top_logprobs if top_logprobs and top_logprobs > 0 else 5
         if logprobs is not None and logprobs > 0:
             generate_params['logprob_proc'] = LogprobProcessor(logprobs)
             logits_processor.extend([generate_params['logprob_proc']])
@@ -317,6 +399,9 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
 
     requested_model = generate_params.pop('model')
     logprob_proc = generate_params.pop('logprob_proc', None)
+    if logprob_proc:
+        logprob_proc.token_alternatives_history.clear()
+    chat_logprobs_offset = [0]  # mutable for closure access in streaming
 
     def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False):
         # begin streaming
@@ -344,12 +429,16 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         }
 
         if logprob_proc:
-            top_logprobs = convert_logprobs_to_tiktoken(model=requested_model, logprobs=logprob_proc.token_alternatives)
-            chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
+            entries = _dict_to_logprob_entries(logprob_proc.token_alternatives)
+            formatted = format_chat_logprobs(entries)
+            if formatted:
+                chunk[resp_list][0]["logprobs"] = formatted
         elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
-            backend_logprobs = get_logprobs_from_backend()
-            if backend_logprobs:
-                chunk[resp_list][0]["logprobs"] = {'top_logprobs': [backend_logprobs]}
+            entries, chat_logprobs_offset[0] = _get_raw_logprob_entries(chat_logprobs_offset[0])
+            if entries:
+                formatted = format_chat_logprobs(entries)
+                if formatted:
+                    chunk[resp_list][0]["logprobs"] = formatted
 
         return chunk
 
@@ -471,12 +560,18 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             }
         }
         if logprob_proc:
-            top_logprobs = convert_logprobs_to_tiktoken(model=requested_model, logprobs=logprob_proc.token_alternatives)
-            resp[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
+            all_entries = []
+            for alt in logprob_proc.token_alternatives_history:
+                all_entries.extend(_dict_to_logprob_entries(alt))
+            formatted = format_chat_logprobs(all_entries)
+            if formatted:
+                resp[resp_list][0]["logprobs"] = formatted
         elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
-            backend_logprobs = get_logprobs_from_backend()
-            if backend_logprobs:
-                resp[resp_list][0]["logprobs"] = {'top_logprobs': [backend_logprobs]}
+            raw = getattr(shared.model, 'last_completion_probabilities', None)
+            if raw:
+                formatted = format_chat_logprobs(raw)
+                if formatted:
+                    resp[resp_list][0]["logprobs"] = formatted
 
         yield resp
 
@@ -518,6 +613,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
         generate_params['stop_event'] = stop_event
     requested_model = generate_params.pop('model')
     logprob_proc = generate_params.pop('logprob_proc', None)
+    if logprob_proc:
+        logprob_proc.token_alternatives_history.clear()
     suffix = body['suffix'] if body['suffix'] else ''
     echo = body['echo']
 
@@ -583,10 +680,13 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                     stop_reason = "length"
 
                 if logprob_proc:
-                    completion_logprobs = {'top_logprobs': [logprob_proc.token_alternatives]}
+                    all_entries = []
+                    for alt in logprob_proc.token_alternatives_history:
+                        all_entries.extend(_dict_to_logprob_entries(alt))
+                    completion_logprobs = format_completion_logprobs(all_entries)
                 elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
-                    backend_logprobs = get_logprobs_from_backend()
-                    completion_logprobs = {'top_logprobs': [backend_logprobs]} if backend_logprobs else None
+                    raw = getattr(shared.model, 'last_completion_probabilities', None)
+                    completion_logprobs = format_completion_logprobs(raw)
                 else:
                     completion_logprobs = None
 
@@ -633,14 +733,15 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
         # Check if usage should be included in streaming chunks per OpenAI spec
         stream_options = body.get('stream_options')
         include_usage = bool(stream_options) and bool(stream_options.get('include_usage') if isinstance(stream_options, dict) else getattr(stream_options, 'include_usage', False))
+        cmpl_logprobs_offset = [0]  # mutable for closure access in streaming
 
         def text_streaming_chunk(content):
             # begin streaming
             if logprob_proc:
-                chunk_logprobs = {'top_logprobs': [logprob_proc.token_alternatives]}
+                chunk_logprobs = format_completion_logprobs(_dict_to_logprob_entries(logprob_proc.token_alternatives))
             elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
-                backend_logprobs = get_logprobs_from_backend()
-                chunk_logprobs = {'top_logprobs': [backend_logprobs]} if backend_logprobs else None
+                entries, cmpl_logprobs_offset[0] = _get_raw_logprob_entries(cmpl_logprobs_offset[0])
+                chunk_logprobs = format_completion_logprobs(entries) if entries else None
             else:
                 chunk_logprobs = None
 
diff --git a/modules/text_generation.py b/modules/text_generation.py
index c78afe3e..787c1814 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -78,10 +78,13 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
     reply = ''
     is_stream = state['stream']
     if len(all_stop_strings) > 0 and not state['stream']:
+        original_logits_processor = state.get('logits_processor')
         stop_event_ref = state.pop('stop_event', None)
         state = copy.deepcopy(state)
         if stop_event_ref is not None:
             state['stop_event'] = stop_event_ref
+        if original_logits_processor is not None:
+            state['logits_processor'] = original_logits_processor
         state['stream'] = True
 
     # Generate
diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index d57020c6..b9918764 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -65,14 +65,16 @@ class LogprobProcessor(LogitsProcessor):
     def __init__(self, logprobs=None):
         self.logprobs = logprobs
         self.token_alternatives = {}
+        self.token_alternatives_history = []
 
     def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
         if self.logprobs is not None:  # 0-5
             log_e_probabilities = F.log_softmax(logits, dim=1)
-            top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
+            top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs)
             top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
             top_probs = [float(x) for x in top_values[0]]
             self.token_alternatives = dict(zip(top_tokens, top_probs))
+            self.token_alternatives_history.append(self.token_alternatives)
 
         return logits
 

From a916fb0e5c81449b4644119decfa86958278860d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:27:24 -0300
Subject: [PATCH 041/210] API: Preserve mid-conversation system message
 positions

---
 extensions/openai/completions.py | 14 +++++++++++++-
 modules/chat.py                  |  5 ++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 10cdbf42..37e9568a 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -239,12 +239,14 @@ def convert_history(history):
     user_input = ""
     user_input_last = True
     system_message = ""
+    seen_non_system = False
 
     for entry in history:
         content = entry["content"]
         role = entry["role"]
 
         if role == "user":
+            seen_non_system = True
             # Extract text content (images handled by model-specific code)
             content = process_multimodal_content(content)
             user_input = content
@@ -256,6 +258,7 @@ def convert_history(history):
 
             current_message = content
         elif role == "assistant":
+            seen_non_system = True
             meta = {}
             tool_calls = entry.get("tool_calls")
             if tool_calls and isinstance(tool_calls, list) and len(tool_calls) > 0:
@@ -272,13 +275,22 @@ def convert_history(history):
             else:
                 chat_dialogue.append(['', current_reply, '', meta])
         elif role == "tool":
+            seen_non_system = True
             user_input_last = False
             meta = {}
             if "tool_call_id" in entry:
                 meta["tool_call_id"] = entry["tool_call_id"]
             chat_dialogue.append(['', '', content, meta])
         elif role in ("system", "developer"):
-            system_message += f"\n{content}" if system_message else content
+            if not seen_non_system:
+                # Leading system messages go to custom_system_message (placed at top)
+                system_message += f"\n{content}" if system_message else content
+            else:
+                # Mid-conversation system messages: preserve position in history
+                if current_message:
+                    chat_dialogue.append([current_message, '', '', {}])
+                    current_message = ""
+                chat_dialogue.append([content, '', '', {"role": "system"}])
 
     if not user_input_last:
         user_input = ""
diff --git a/modules/chat.py b/modules/chat.py
index 5cf550d6..8af92273 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -315,7 +315,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
                         "tool_call_id": item.get('tool_call_id', '')
                     })
 
-        if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
+        if entry_meta.get('role') == 'system':
+            if user_msg:
+                messages.insert(insert_pos, {"role": "system", "content": user_msg})
+        elif user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
             # Check for user message attachments in metadata
             user_key = f"user_{row_idx}"
             enhanced_user_msg = user_msg

From 2466305f76a7c197178096021eb3f18e0449410a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 16:02:57 -0300
Subject: [PATCH 042/210] Add tool examples

---
 user_data/tools/get_datetime.py | 18 ++++++++++++++++++
 user_data/tools/roll_dice.py    | 23 +++++++++++++++++++++++
 user_data/tools/web_search.py   | 27 +++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 user_data/tools/get_datetime.py
 create mode 100644 user_data/tools/roll_dice.py
 create mode 100644 user_data/tools/web_search.py

diff --git a/user_data/tools/get_datetime.py b/user_data/tools/get_datetime.py
new file mode 100644
index 00000000..f0a92777
--- /dev/null
+++ b/user_data/tools/get_datetime.py
@@ -0,0 +1,26 @@
+from datetime import datetime
+
+# Function-calling definition for this tool; it declares no parameters.
+tool = {
+    "type": "function",
+    "function": {
+        "name": "get_datetime",
+        "description": "Get the current date and time.",
+        "parameters": {
+            "type": "object",
+            "properties": {},
+        }
+    }
+}
+
+
+def execute(arguments):
+    """Return the current date and time as formatted strings.
+
+    `arguments` is not read. The result is a dict with a "date" key
+    (formatted "%Y-%m-%d") and a "time" key (formatted "%I:%M %p").
+    """
+    current = datetime.now()
+    date_text = current.strftime("%Y-%m-%d")
+    time_text = current.strftime("%I:%M %p")
+    return {"date": date_text, "time": time_text}
diff --git a/user_data/tools/roll_dice.py b/user_data/tools/roll_dice.py
new file mode 100644
index 00000000..9cab48a8
--- /dev/null
+++ b/user_data/tools/roll_dice.py
@@ -0,0 +1,58 @@
+import random
+
+# Upper bounds so a single call stays cheap even when the model asks for
+# an absurd number of dice or sides.
+MAX_DICE = 1000
+MAX_SIDES = 1_000_000
+
+tool = {
+    "type": "function",
+    "function": {
+        "name": "roll_dice",
+        "description": "Roll one or more dice with the specified number of sides.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "count": {"type": "integer", "description": "Number of dice to roll.", "default": 1},
+                "sides": {"type": "integer", "description": "Number of sides per die.", "default": 20},
+            },
+        }
+    }
+}
+
+
+def _to_int(value):
+    """Return `value` as an int, or None if it is not a whole number."""
+    if isinstance(value, bool):  # bool is an int subclass; reject it explicitly
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, float):
+        return int(value) if value.is_integer() else None
+    if isinstance(value, str):
+        try:
+            return int(value)
+        except ValueError:
+            return None
+    return None
+
+
+def execute(arguments):
+    """Roll `count` dice with `sides` sides each.
+
+    `arguments` may omit either key; count defaults to 1 and sides to 20.
+    Returns {"rolls": [...], "total": sum} on success. If either value is
+    not a whole number or falls outside 1..MAX_DICE / 1..MAX_SIDES, returns
+    {"error": message} instead of raising or building a huge list.
+    """
+    arguments = arguments or {}
+    count = _to_int(arguments.get("count", 1))
+    sides = _to_int(arguments.get("sides", 20))
+
+    if count is None or not 1 <= count <= MAX_DICE:
+        return {"error": f"count must be an integer between 1 and {MAX_DICE}."}
+    if sides is None or not 1 <= sides <= MAX_SIDES:
+        return {"error": f"sides must be an integer between 1 and {MAX_SIDES}."}
+
+    rolls = [random.randint(1, sides) for _ in range(count)]
+    return {"rolls": rolls, "total": sum(rolls)}
diff --git a/user_data/tools/web_search.py b/user_data/tools/web_search.py
new file mode 100644
index 00000000..8923eab0
--- /dev/null
+++ b/user_data/tools/web_search.py
@@ -0,0 +1,42 @@
+from modules.web_search import perform_web_search
+
+tool = {
+    "type": "function",
+    "function": {
+        "name": "web_search",
+        "description": "Search the web using DuckDuckGo and return page contents.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "query": {"type": "string", "description": "The search query."},
+            },
+            "required": ["query"]
+        }
+    }
+}
+
+
+def execute(arguments):
+    """Run a web search for arguments["query"] and return the non-empty pages.
+
+    Returns a list of {"title", "url", "content"} dicts, with each page's
+    content cut to 4000 characters. When the query is missing/blank or no
+    page has usable content, a one-element list holding an {"error": ...}
+    dict is returned instead.
+    """
+    # The schema marks "query" as required, but model output is not
+    # guaranteed to honor it; don't hit the network for an empty search.
+    query = str(arguments.get("query") or "").strip()
+    if not query:
+        return [{"error": "No search query provided."}]
+
+    # NOTE(review): assumes perform_web_search yields dict-like entries (or
+    # falsy placeholders) with "title"/"url"/"content" keys - confirm.
+    results = perform_web_search(query, num_pages=3) or []
+    output = []
+    for r in results:
+        content = (r.get("content") or "") if r else ""
+        if content.strip():
+            output.append({"title": r.get("title", ""), "url": r.get("url", ""), "content": content[:4000]})
+
+    return output if output else [{"error": "No results found."}]

From d45c9b3c5957692f2f8900cc0e19a22220bc114a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 16:06:00 -0300
Subject: [PATCH 043/210] API: Minor logprobs fixes

---
 extensions/openai/completions.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 37e9568a..eb1702a8 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -98,7 +98,7 @@ def format_chat_logprobs(entries):
             "top_logprobs": top_list
         })
 
-    return {"content": content} if content else None
+    return {"content": content, "refusal": None} if content else None
 
 
 def format_completion_logprobs(entries):
@@ -174,7 +174,7 @@ def process_parameters(body, is_legacy=False):
     logprobs = body.get('logprobs', None)
     top_logprobs = body.get('top_logprobs', None)
     if logprobs is True:
-        logprobs = top_logprobs if top_logprobs and top_logprobs > 0 else 5
+        logprobs = max(top_logprobs, 1) if top_logprobs is not None else 5
         generate_params['logprobs'] = logprobs
 
     # For llama.cpp and ExLlamav3 native, logit_bias and logprobs are forwarded natively
@@ -677,6 +677,9 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                 if original_seed >= 0:
                     generate_params['seed'] = original_seed + _n
 
+                if logprob_proc:
+                    logprob_proc.token_alternatives_history.clear()
+
                 # generate reply #######################################
                 debug_msg({'prompt': prompt, 'generate_params': generate_params})
                 generator = generate_reply(prompt, generate_params, is_chat=False)

From 2d0cc7726eae0d98e52352b740397d86cdd2973e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 16:29:46 -0300
Subject: [PATCH 044/210] API: Add reasoning_content field to non-streaming
 chat completions

Extract thinking/reasoning blocks (e.g. <think>...</think>) into a
separate reasoning_content field on the assistant message, matching
the convention used by DeepSeek, llama.cpp, and SGLang.
---
 extensions/openai/completions.py | 11 ++++-
 modules/html_generator.py        | 62 ++--------------------------
 modules/reasoning.py             | 71 ++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 60 deletions(-)
 create mode 100644 modules/reasoning.py

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index eb1702a8..0eb0cd27 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -13,6 +13,7 @@ from extensions.openai.errors import InvalidRequestError
 from extensions.openai.typing import ToolDefinition
 from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
 from modules import shared
+from modules.reasoning import extract_reasoning
 from modules.chat import (
     generate_chat_prompt,
     generate_chat_reply,
@@ -553,6 +554,14 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         else:
             yield chunk
     else:
+        reasoning, content = extract_reasoning(answer) if not tool_calls else (None, answer)
+        message = {
+            "role": "assistant",
+            "refusal": None,
+            "content": None if tool_calls else content,
+            **({"reasoning_content": reasoning} if reasoning else {}),
+            **({"tool_calls": tool_calls} if tool_calls else {}),
+        }
         resp = {
             "id": cmpl_id,
             "object": object_type,
@@ -562,7 +571,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             resp_list: [{
                 "index": 0,
                 "finish_reason": stop_reason,
-                "message": {"role": "assistant", "refusal": None, "content": None if tool_calls else answer, **({"tool_calls": tool_calls} if tool_calls else {})},
+                "message": message,
                 "logprobs": None,
             }],
             "usage": {
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 472a9ea0..4d9904fb 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -10,6 +10,7 @@ import markdown
 from PIL import Image, ImageOps
 
 from modules import shared
+from modules.reasoning import extract_reasoning
 from modules.sane_markdown_lists import SaneListExtension
 from modules.utils import get_available_chat_styles
 
@@ -108,66 +109,9 @@ def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
-# Thinking block format definitions: (start_tag, end_tag, content_start_tag)
-# Use None for start_tag to match from beginning (end-only formats should be listed last)
-THINKING_FORMATS = [
-    ('<think>', '</think>', None),
-    ('<|channel|>analysis<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
-    ('<seed:think>', '</seed:think>', None),
-    ('<|think|>', '<|end|>', '<|content|>'),  # Solar Open
-    ('Thinking Process:', '</think>', None),  # Qwen3.5 verbose thinking outside tags
-    (None, '</think>', None),  # End-only variant (e.g., Qwen3-next)
-]
-
-
 def extract_thinking_block(string):
-    """Extract thinking blocks from the beginning of a string."""
-    if not string:
-        return None, string
-
-    for start_tag, end_tag, content_tag in THINKING_FORMATS:
-        end_esc = html.escape(end_tag)
-        content_esc = html.escape(content_tag) if content_tag else None
-
-        if start_tag is None:
-            # End-only format: require end tag, start from beginning
-            end_pos = string.find(end_esc)
-            if end_pos == -1:
-                continue
-            thought_start = 0
-        else:
-            # Normal format: require start tag
-            start_esc = html.escape(start_tag)
-            start_pos = string.find(start_esc)
-            if start_pos == -1:
-                continue
-            thought_start = start_pos + len(start_esc)
-            end_pos = string.find(end_esc, thought_start)
-
-        if end_pos == -1:
-            # End tag missing - check if content tag can serve as fallback
-            if content_esc:
-                content_pos = string.find(content_esc, thought_start)
-                if content_pos != -1:
-                    thought_end = content_pos
-                    content_start = content_pos + len(content_esc)
-                else:
-                    thought_end = len(string)
-                    content_start = len(string)
-            else:
-                thought_end = len(string)
-                content_start = len(string)
-        else:
-            thought_end = end_pos
-            if content_esc:
-                content_pos = string.find(content_esc, end_pos)
-                content_start = content_pos + len(content_esc) if content_pos != -1 else end_pos + len(end_esc)
-            else:
-                content_start = end_pos + len(end_esc)
-
-        return string[thought_start:thought_end], string[content_start:]
-
-    return None, string
+    """Extract thinking blocks from the beginning of an HTML-escaped string."""
+    return extract_reasoning(string, html_escaped=True)
 
 
 def build_thinking_block(thinking_content, message_id, has_remaining_content):
diff --git a/modules/reasoning.py b/modules/reasoning.py
new file mode 100644
index 00000000..12f8553d
--- /dev/null
+++ b/modules/reasoning.py
@@ -0,0 +1,81 @@
+import html as html_module
+
+# Thinking block format definitions: (start_tag, end_tag, content_start_tag)
+# Use None for start_tag to match from beginning (end-only formats should be listed last)
+THINKING_FORMATS = [
+    ('<think>', '</think>', None),
+    ('<|channel|>analysis<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
+    ('<seed:think>', '</seed:think>', None),
+    ('<|think|>', '<|end|>', '<|content|>'),  # Solar Open
+    ('Thinking Process:', '</think>', None),  # Qwen3.5 verbose thinking outside tags
+    (None, '</think>', None),  # End-only variant (e.g., Qwen3-next)
+]
+
+
+def extract_reasoning(text, html_escaped=False):
+    """Split model output into its reasoning part and its final answer.
+
+    Formats in THINKING_FORMATS are tried in order and the first one that
+    matches decides the result. A format with a start tag matches when that
+    tag occurs anywhere in `text` (anything before the tag is discarded);
+    an end-only format matches when its end tag occurs, and the reasoning
+    then starts at the beginning of `text`.
+
+    If the end tag is missing, the reasoning runs up to the format's content
+    tag when that is present, otherwise to the end of `text` (leaving an
+    empty final answer).
+
+    With html_escaped=True every tag is passed through html.escape() before
+    searching, so the function can be used on already-escaped UI strings.
+
+    Returns a (reasoning_content, final_content) tuple; reasoning_content is
+    None and final_content is `text` unchanged when nothing matches or when
+    `text` is empty.
+    """
+    if not text:
+        return None, text
+
+    def as_searched(tag):
+        # Tags must be escaped the same way the text was, or find() misses them.
+        return html_module.escape(tag) if html_escaped else tag
+
+    for opener, closer, answer_marker in THINKING_FORMATS:
+        closer_tag = as_searched(closer)
+        marker_tag = as_searched(answer_marker) if answer_marker else None
+
+        if opener is None:
+            # End-only format: reasoning starts at offset 0, end tag is mandatory.
+            reasoning_from = 0
+            closer_at = text.find(closer_tag)
+            if closer_at == -1:
+                continue
+        else:
+            opener_tag = as_searched(opener)
+            opener_at = text.find(opener_tag)
+            if opener_at == -1:
+                continue
+            reasoning_from = opener_at + len(opener_tag)
+            closer_at = text.find(closer_tag, reasoning_from)
+
+        if closer_at != -1:
+            # Closed block: the answer starts after the content tag if one
+            # follows, otherwise right after the end tag.
+            reasoning_to = closer_at
+            answer_from = closer_at + len(closer_tag)
+            if marker_tag:
+                marker_at = text.find(marker_tag, closer_at)
+                if marker_at != -1:
+                    answer_from = marker_at + len(marker_tag)
+        else:
+            # Unclosed block: everything is reasoning unless a content tag
+            # marks where the answer begins.
+            reasoning_to = answer_from = len(text)
+            if marker_tag:
+                marker_at = text.find(marker_tag, reasoning_from)
+                if marker_at != -1:
+                    reasoning_to = marker_at
+                    answer_from = marker_at + len(marker_tag)
+
+        return text[reasoning_from:reasoning_to], text[answer_from:]
+
+    return None, text

From 9a7428b6271ffadf2094f93dd1e385c7c8444d71 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:16:04 -0700
Subject: [PATCH 045/210] UI: Add collapsible accordions for tool calling steps

---
 extensions/openai/utils.py |  93 +++++++++++++++++++++++----------
 modules/chat.py            |  66 ++++++++++++++++++++----
 modules/html_generator.py  | 102 +++++++++++++++++++++++++++++++------
 3 files changed, 207 insertions(+), 54 deletions(-)

diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py
index f4a31d1a..6fb05f08 100644
--- a/extensions/openai/utils.py
+++ b/extensions/openai/utils.py
@@ -123,6 +123,7 @@ def _parseChannelToolCalls(answer: str, tool_names: list[str]):
         <|channel|>commentary to=functions.func_name <|constrain|>json<|message|>{"arg": "value"}
     """
     matches = []
+    start_pos = None
     for m in re.finditer(
         r'<\|channel\|>commentary to=functions\.([^<\s]+)\s*(?:<\|constrain\|>json)?<\|message\|>',
         answer
@@ -135,6 +136,8 @@ def _parseChannelToolCalls(answer: str, tool_names: list[str]):
             continue
         try:
             arguments = json.loads(json_str)
+            if start_pos is None:
+                start_pos = m.start()
             matches.append({
                 "type": "function",
                 "function": {
@@ -144,7 +147,7 @@ def _parseChannelToolCalls(answer: str, tool_names: list[str]):
             })
         except json.JSONDecodeError:
             pass
-    return matches
+    return matches, start_pos
 
 
 def _parseBareNameToolCalls(answer: str, tool_names: list[str]):
@@ -155,6 +158,7 @@ def _parseBareNameToolCalls(answer: str, tool_names: list[str]):
     Multiple calls are concatenated directly or separated by whitespace.
     """
     matches = []
+    start_pos = None
     # Match tool name followed by opening brace, then extract balanced JSON
     escaped_names = [re.escape(name) for name in tool_names]
     pattern = r'(?:' + '|'.join(escaped_names) + r')\s*\{'
@@ -173,6 +177,8 @@ def _parseBareNameToolCalls(answer: str, tool_names: list[str]):
             continue
         try:
             arguments = json.loads(json_str)
+            if start_pos is None:
+                start_pos = match.start()
             matches.append({
                 "type": "function",
                 "function": {
@@ -182,7 +188,7 @@ def _parseBareNameToolCalls(answer: str, tool_names: list[str]):
             })
         except json.JSONDecodeError:
             pass
-    return matches
+    return matches, start_pos
 
 
 def _parseXmlParamToolCalls(answer: str, tool_names: list[str]):
@@ -196,6 +202,7 @@ def _parseXmlParamToolCalls(answer: str, tool_names: list[str]):
         </tool_call>
     """
     matches = []
+    start_pos = None
     for tc_match in re.finditer(r'<tool_call>\s*(.*?)\s*</tool_call>', answer, re.DOTALL):
         tc_content = tc_match.group(1)
         func_match = re.search(r'<function=([^>]+)>', tc_content)
@@ -213,6 +220,8 @@ def _parseXmlParamToolCalls(answer: str, tool_names: list[str]):
             except (json.JSONDecodeError, ValueError):
                 pass  # keep as string
             arguments[param_name] = param_value
+        if start_pos is None:
+            start_pos = tc_match.start()
         matches.append({
             "type": "function",
             "function": {
@@ -220,7 +229,7 @@ def _parseXmlParamToolCalls(answer: str, tool_names: list[str]):
                 "arguments": arguments
             }
         })
-    return matches
+    return matches, start_pos
 
 
 def _parseKimiToolCalls(answer: str, tool_names: list[str]):
@@ -232,6 +241,7 @@ def _parseKimiToolCalls(answer: str, tool_names: list[str]):
         <|tool_calls_section_end|>
     """
     matches = []
+    start_pos = None
     for m in re.finditer(
         r'<\|tool_call_begin\|>\s*(?:functions\.)?(\S+?)(?::\d+)?\s*<\|tool_call_argument_begin\|>\s*',
         answer
@@ -244,6 +254,10 @@ def _parseKimiToolCalls(answer: str, tool_names: list[str]):
             continue
         try:
             arguments = json.loads(json_str)
+            if start_pos is None:
+                # Check for section begin marker before the call marker
+                section = answer.rfind('<|tool_calls_section_begin|>', 0, m.start())
+                start_pos = section if section != -1 else m.start()
             matches.append({
                 "type": "function",
                 "function": {
@@ -253,7 +267,7 @@ def _parseKimiToolCalls(answer: str, tool_names: list[str]):
             })
         except json.JSONDecodeError:
             pass
-    return matches
+    return matches, start_pos
 
 
 def _parseMiniMaxToolCalls(answer: str, tool_names: list[str]):
@@ -267,6 +281,7 @@ def _parseMiniMaxToolCalls(answer: str, tool_names: list[str]):
         </minimax:tool_call>
     """
     matches = []
+    start_pos = None
     for tc_match in re.finditer(r'<minimax:tool_call>\s*(.*?)\s*</minimax:tool_call>', answer, re.DOTALL):
         tc_content = tc_match.group(1)
         # Split on <invoke> to handle multiple parallel calls in one block
@@ -284,6 +299,8 @@ def _parseMiniMaxToolCalls(answer: str, tool_names: list[str]):
                 except (json.JSONDecodeError, ValueError):
                     pass  # keep as string
                 arguments[param_name] = param_value
+            if start_pos is None:
+                start_pos = tc_match.start()
             matches.append({
                 "type": "function",
                 "function": {
@@ -291,7 +308,7 @@ def _parseMiniMaxToolCalls(answer: str, tool_names: list[str]):
                     "arguments": arguments
                 }
             })
-    return matches
+    return matches, start_pos
 
 
 def _parseDeepSeekToolCalls(answer: str, tool_names: list[str]):
@@ -301,6 +318,7 @@ def _parseDeepSeekToolCalls(answer: str, tool_names: list[str]):
         <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>func_name<｜tool▁sep｜>{"arg": "value"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>
     """
     matches = []
+    start_pos = None
     for m in re.finditer(
         r'<｜tool▁call▁begin｜>\s*(\S+?)\s*<｜tool▁sep｜>\s*',
         answer
@@ -313,6 +331,10 @@ def _parseDeepSeekToolCalls(answer: str, tool_names: list[str]):
             continue
         try:
             arguments = json.loads(json_str)
+            if start_pos is None:
+                # Check for section begin marker before the call marker
+                section = answer.rfind('<｜tool▁calls▁begin｜>', 0, m.start())
+                start_pos = section if section != -1 else m.start()
             matches.append({
                 "type": "function",
                 "function": {
@@ -322,7 +344,7 @@ def _parseDeepSeekToolCalls(answer: str, tool_names: list[str]):
             })
         except json.JSONDecodeError:
             pass
-    return matches
+    return matches, start_pos
 
 
 def _parseGlmToolCalls(answer: str, tool_names: list[str]):
@@ -335,6 +357,7 @@ def _parseGlmToolCalls(answer: str, tool_names: list[str]):
         </tool_call>
     """
     matches = []
+    start_pos = None
     for tc_match in re.finditer(r'<tool_call>\s*(.*?)\s*</tool_call>', answer, re.DOTALL):
         tc_content = tc_match.group(1)
         # First non-tag text is the function name
@@ -356,6 +379,8 @@ def _parseGlmToolCalls(answer: str, tool_names: list[str]):
             except (json.JSONDecodeError, ValueError):
                 pass  # keep as string
             arguments[k] = v
+        if start_pos is None:
+            start_pos = tc_match.start()
         matches.append({
             "type": "function",
             "function": {
@@ -363,7 +388,7 @@ def _parseGlmToolCalls(answer: str, tool_names: list[str]):
                 "arguments": arguments
             }
         })
-    return matches
+    return matches, start_pos
 
 
 def _parsePythonicToolCalls(answer: str, tool_names: list[str]):
@@ -373,10 +398,11 @@ def _parsePythonicToolCalls(answer: str, tool_names: list[str]):
         [func_name(param1="value1", param2="value2"), func_name2(...)]
     """
     matches = []
+    start_pos = None
     # Match a bracketed list of function calls
     bracket_match = re.search(r'\[([^\[\]]+)\]', answer)
     if not bracket_match:
-        return matches
+        return matches, start_pos
 
     inner = bracket_match.group(1)
 
@@ -411,6 +437,8 @@ def _parsePythonicToolCalls(answer: str, tool_names: list[str]):
                     pass
                 arguments[param_name] = param_value
 
+        if start_pos is None:
+            start_pos = bracket_match.start()
         matches.append({
             "type": "function",
             "function": {
@@ -419,55 +447,62 @@ def _parsePythonicToolCalls(answer: str, tool_names: list[str]):
             }
         })
 
-    return matches
+    return matches, start_pos
 
 
-def parseToolCall(answer: str, tool_names: list[str]):
+def parseToolCall(answer: str, tool_names: list[str], return_prefix: bool = False):
     matches = []
+    start_pos = None
+
+    def _return(matches, start_pos):
+        if return_prefix:
+            prefix = answer[:start_pos] if matches and start_pos is not None else ''
+            return matches, prefix
+        return matches
 
     # abort on very short answers to save computation cycles
     if len(answer) < 10:
-        return matches
+        return _return(matches, start_pos)
 
     # Check for DeepSeek-style tool calls (fullwidth Unicode token delimiters)
-    matches = _parseDeepSeekToolCalls(answer, tool_names)
+    matches, start_pos = _parseDeepSeekToolCalls(answer, tool_names)
     if matches:
-        return matches
+        return _return(matches, start_pos)
 
     # Check for Kimi-K2-style tool calls (pipe-delimited tokens)
-    matches = _parseKimiToolCalls(answer, tool_names)
+    matches, start_pos = _parseKimiToolCalls(answer, tool_names)
     if matches:
-        return matches
+        return _return(matches, start_pos)
 
     # Check for channel-based tool calls (e.g. GPT-OSS format)
-    matches = _parseChannelToolCalls(answer, tool_names)
+    matches, start_pos = _parseChannelToolCalls(answer, tool_names)
     if matches:
-        return matches
+        return _return(matches, start_pos)
 
     # Check for MiniMax-style tool calls (invoke/parameter XML tags)
-    matches = _parseMiniMaxToolCalls(answer, tool_names)
+    matches, start_pos = _parseMiniMaxToolCalls(answer, tool_names)
     if matches:
-        return matches
+        return _return(matches, start_pos)
 
     # Check for GLM-style tool calls (arg_key/arg_value XML pairs)
-    matches = _parseGlmToolCalls(answer, tool_names)
+    matches, start_pos = _parseGlmToolCalls(answer, tool_names)
     if matches:
-        return matches
+        return _return(matches, start_pos)
 
     # Check for XML-parameter style tool calls (e.g. Qwen3.5 format)
-    matches = _parseXmlParamToolCalls(answer, tool_names)
+    matches, start_pos = _parseXmlParamToolCalls(answer, tool_names)
     if matches:
-        return matches
+        return _return(matches, start_pos)
 
     # Check for bare function-name style tool calls (e.g. Mistral format)
-    matches = _parseBareNameToolCalls(answer, tool_names)
+    matches, start_pos = _parseBareNameToolCalls(answer, tool_names)
     if matches:
-        return matches
+        return _return(matches, start_pos)
 
     # Check for pythonic-style tool calls (e.g. Llama 4 format)
-    matches = _parsePythonicToolCalls(answer, tool_names)
+    matches, start_pos = _parsePythonicToolCalls(answer, tool_names)
     if matches:
-        return matches
+        return _return(matches, start_pos)
 
     # Define the regex pattern to find the JSON content wrapped in <function>, <tools>, <tool_call>, and other tags observed from various models
     patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"]
@@ -501,6 +536,8 @@ def parseToolCall(answer: str, tool_names: list[str]):
             for candidate_dict in candidates:
                 checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
                 if checked_candidate is not None:
+                    if start_pos is None:
+                        start_pos = match.start()
                     matches.append(checked_candidate)
 
         # last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags
@@ -524,4 +561,4 @@ def parseToolCall(answer: str, tool_names: list[str]):
                 # Ignore invalid JSON silently
                 pass
 
-    return matches
+    return _return(matches, start_pos)
diff --git a/modules/chat.py b/modules/chat.py
index 8af92273..d3cd0cae 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -23,6 +23,7 @@ from modules.extensions import apply_extensions
 from modules.html_generator import (
     chat_html_wrapper,
     convert_to_markdown,
+    extract_thinking_block,
     make_thumbnail
 )
 from modules.image_utils import open_image_safely
@@ -1168,12 +1169,19 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         cur_text = text if _tool_turn == 0 else ''
 
         for i, history in enumerate(generate_chat_reply(cur_text, state, regen, cont, loading_message=True, for_ui=True)):
-            # Prepend accumulated tool output to visible reply
+            # Prepend accumulated tool output to visible reply for display.
+            # Save and restore the original to prevent the markers from leaking
+            # back into chatbot_wrapper's shared output object, which would cause
+            # duplication on the next yield.
+            _original_visible = history['visible'][-1][1] if visible_prefix else None
             if visible_prefix:
-                history['visible'][-1][1] = '\n\n'.join(visible_prefix + [history['visible'][-1][1]])
+                history['visible'][-1][1] = '\n\n'.join(visible_prefix + [_original_visible])
 
             yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'], last_message_only=(i > 0)), history
 
+            if visible_prefix:
+                history['visible'][-1][1] = _original_visible
+
             if i == 0:
                 # Save old tool_sequence into version 0 (created by chatbot_wrapper
                 # on the first yield).  Only needed on the first regeneration when
@@ -1196,6 +1204,15 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
             if tool_func_names and parseToolCall(history['internal'][-1][1], tool_func_names):
                 break
 
+        # Save the model's visible output before re-applying visible_prefix,
+        # so we can extract thinking content from just this turn's output.
+        _model_visible = history['visible'][-1][1]
+
+        # Re-apply visible prefix to the final state after streaming completes.
+        # This is safe because we're no longer sharing the object with chatbot_wrapper.
+        if visible_prefix:
+            history['visible'][-1][1] = '\n\n'.join(visible_prefix + [_model_visible])
+
         save_history(history, state['unique_id'], state['character_menu'], state['mode'])
 
         # Check for tool calls
@@ -1203,7 +1220,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
             break
 
         answer = history['internal'][-1][1]
-        parsed_calls = parseToolCall(answer, tool_func_names) if answer else None
+        parsed_calls, content_prefix = parseToolCall(answer, tool_func_names, return_prefix=True) if answer else (None, '')
 
         if not parsed_calls:
             break  # No tool calls — done
@@ -1232,12 +1249,38 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         # Clear internal (raw tool markup)
         history['internal'][-1][1] = ''
 
-        # Add call summary to visible prefix
-        call_summary = ', '.join(f'{tc["function"]["name"]}(...)' for tc in parsed_calls)
-        visible_prefix.append('Calling: ' + call_summary)
+        # Preserve thinking block and intermediate text from this turn.
+        # content_prefix is the raw text before tool call syntax (returned
+        # by parseToolCall); HTML-escape it and extract thinking to get
+        # the content the user should see.
+        content_text = html.escape(content_prefix)
+        thinking_content, intermediate = extract_thinking_block(content_text)
+        if thinking_content:
+            visible_prefix.append(f'&lt;think&gt;\n{thinking_content}\n&lt;/think&gt;')
+        if intermediate and intermediate.strip():
+            visible_prefix.append(intermediate.strip())
 
-        # Execute tools, store results
+        # Build args summaries and show placeholder accordions with "..."
+        # before execution starts (tool calls may be slow, e.g. web search).
+        tc_headers = []
         for tc in parsed_calls:
+            fn_name = tc['function']['name']
+            fn_args = tc['function'].get('arguments', {})
+            if isinstance(fn_args, dict) and fn_args:
+                args_summary = ', '.join(f'{k}={json.dumps(v, ensure_ascii=False)}' for k, v in fn_args.items())
+            elif isinstance(fn_args, dict):
+                args_summary = ''
+            else:
+                args_summary = str(fn_args)
+
+            tc_headers.append(f'{fn_name}({args_summary})')
+
+        pending_placeholders = [f'<tool_call>{h}\n...\n</tool_call>' for h in tc_headers]
+        history['visible'][-1][1] = '\n\n'.join(visible_prefix + pending_placeholders)
+        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
+
+        # Execute tools, store results, and replace placeholders with real results
+        for i, tc in enumerate(parsed_calls):
             fn_name = tc['function']['name']
             fn_args = tc['function'].get('arguments', {})
             result = execute_tool(fn_name, fn_args, tool_executors)
@@ -1248,11 +1291,14 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
             except (json.JSONDecodeError, TypeError):
                 pretty_result = result
 
-            visible_prefix.append(f'**{fn_name}**\n```json\n{pretty_result}\n```')
+            # Replace the placeholder with the real result
+            pending_placeholders[i] = f'<tool_call>{tc_headers[i]}\n{pretty_result}\n</tool_call>'
+            history['visible'][-1][1] = '\n\n'.join(visible_prefix + pending_placeholders)
+            yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
 
-        # Show tool results
+        # Move completed tool calls into visible_prefix for next turns
+        visible_prefix.extend(pending_placeholders)
         history['visible'][-1][1] = '\n\n'.join(visible_prefix)
-        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
         save_history(history, state['unique_id'], state['character_menu'], state['mode'])
 
         state['history'] = history
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 4d9904fb..f3811602 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -114,7 +114,36 @@ def extract_thinking_block(string):
     return extract_reasoning(string, html_escaped=True)
 
 
-def build_thinking_block(thinking_content, message_id, has_remaining_content):
+
+def build_tool_call_block(header, body, message_id, index):
+    """Build the <details> accordion HTML for one tool call; a body of exactly '...' renders a pending placeholder."""
+    block_id = f"tool-call-{message_id}-{index}"
+
+    if body == '...':
+        # Pending placeholder: escaped header followed by " ...", with no expandable body
+        return f'''
+        <details class="thinking-block" data-block-id="{block_id}">
+            <summary class="thinking-header">
+                {tool_svg_small}
+                <span class="thinking-title">{html.escape(header)} ...</span>
+            </summary>
+        </details>
+        '''
+
+    # Emit an HTML-escaped <pre><code class="nohighlight"> directly to avoid highlight.js auto-detection
+    escaped_body = html.escape(body)
+    return f'''
+    <details class="thinking-block" data-block-id="{block_id}">
+        <summary class="thinking-header">
+            {tool_svg_small}
+            <span class="thinking-title">{html.escape(header)}</span>
+        </summary>
+        <div class="thinking-content pretty_scrollbar"><pre><code class="nohighlight">{escaped_body}</code></pre></div>
+    </details>
+    '''
+
+
+def build_thinking_block(thinking_content, message_id, has_remaining_content, thinking_index=0):
     """Build HTML for a thinking block."""
     if thinking_content is None:
         return None
@@ -123,7 +152,7 @@ def build_thinking_block(thinking_content, message_id, has_remaining_content):
     thinking_html = process_markdown_content(thinking_content)
 
     # Generate unique ID for the thinking block
-    block_id = f"thinking-{message_id}-0"
+    block_id = f"thinking-{message_id}-{thinking_index}"
 
     # Check if thinking is complete or still in progress
     is_streaming = not has_remaining_content
@@ -304,24 +333,64 @@ def convert_to_markdown(string, message_id=None):
     if message_id is None:
         message_id = "unknown"
 
-    # Extract different components from the string
-    thinking_content, remaining_content = extract_thinking_block(string)
+    # Find tool call blocks by position, then process the text segments
+    # between them using extract_thinking_block (which supports all
+    # THINKING_FORMATS, including end-only variants like Qwen's).
+    tool_call_pattern = re.compile(r'<tool_call>(.*?)\n(.*?)\n</tool_call>', re.DOTALL)
+    tool_calls = list(tool_call_pattern.finditer(string))
 
-    # Build individual HTML blocks
-    blocks = []
+    if not tool_calls:
+        # No tool calls — use original single-pass extraction
+        thinking_content, remaining_content = extract_thinking_block(string)
+        blocks = []
+        thinking_html = build_thinking_block(thinking_content, message_id, bool(remaining_content))
+        if thinking_html:
+            blocks.append(thinking_html)
 
-    # Add thinking block if present
-    thinking_html = build_thinking_block(thinking_content, message_id, bool(remaining_content))
-    if thinking_html:
-        blocks.append(thinking_html)
+        main_html = build_main_content_block(remaining_content)
+        if main_html:
+            blocks.append(main_html)
 
-    # Add main content block
-    main_html = build_main_content_block(remaining_content)
-    if main_html:
-        blocks.append(main_html)
+        return ''.join(blocks)
 
-    # Assemble all blocks into final HTML
-    return ''.join(blocks)
+    # Split string into text segments around tool_call blocks and
+    # run extract_thinking_block on each segment for full format support.
+    html_parts = []
+    last_end = 0
+    tool_idx = 0
+    think_idx = 0
+
+    def process_text_segment(text, is_last_segment):
+        """Append one text segment's thinking block (if any) and its remaining markdown to html_parts."""
+        nonlocal think_idx
+        if not text.strip():
+            return
+
+        thinking_content, remaining = extract_thinking_block(text)
+        if thinking_content is not None:
+            has_remaining = bool(remaining.strip()) or not is_last_segment  # non-final segments are followed by more content, so never shown as still streaming
+            html_parts.append(build_thinking_block(thinking_content, message_id, has_remaining, think_idx))
+            think_idx += 1  # keeps each thinking block's data-block-id index distinct within the message
+            text = remaining
+
+        if text.strip():
+            html_parts.append(process_markdown_content(text))
+
+    for tc in tool_calls:
+        # Process text before this tool_call
+        process_text_segment(string[last_end:tc.start()], is_last_segment=False)
+
+        # Add tool call accordion
+        header = tc.group(1).strip()
+        body = tc.group(2).strip()
+        html_parts.append(build_tool_call_block(header, body, message_id, tool_idx))
+        tool_idx += 1
+        last_end = tc.end()
+
+    # Process text after the last tool_call
+    process_text_segment(string[last_end:], is_last_segment=True)
+
+    return ''.join(html_parts)
 
 
 def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
@@ -379,6 +448,7 @@ branch_svg = '''<svg  xmlns="http://www.w3.org/2000/svg"  width="24"  height="24
 edit_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="tabler-icon tabler-icon-pencil"><path d="M4 20h4l10.5 -10.5a2.828 2.828 0 1 0 -4 -4l-10.5 10.5v4"></path><path d="M13.5 6.5l4 4"></path></svg>'''
 info_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="thinking-icon tabler-icon tabler-icon-info-circle"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M12 2a10 10 0 0 1 0 20a10 10 0 0 1 0 -20z" /><path d="M12 16v-4" /><path d="M12 8h.01" /></svg>'''
 info_svg_small = '''<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="thinking-icon tabler-icon tabler-icon-info-circle"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M12 2a10 10 0 0 1 0 20a10 10 0 0 1 0 -20z" /><path d="M12 16v-4" /><path d="M12 8h.01" /></svg>'''
+tool_svg_small = '''<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="thinking-icon tabler-icon tabler-icon-tool"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M7 10h3v-3l-3.5 -3.5a6 6 0 0 1 8 8l6 6a2 2 0 0 1 -3 3l-6 -6a6 6 0 0 1 -8 -8l3.5 3.5" /></svg>'''
 attachment_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21.44 11.05l-9.19 9.19a6 6 0 0 1-8.48-8.48l9.19-9.19a4 4 0 0 1 5.66 5.66l-9.2 9.19a2 2 0 0 1-2.83-2.83l8.49-8.48"></path></svg>'''
 
 copy_button = f'<button class="footer-button footer-copy-button" title="Copy" onclick="copyToClipboard(this)">{copy_svg}</button>'

From 3e6bd1a310217c89fa5dff23c34e92ae43f1acb9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:30:51 -0700
Subject: [PATCH 046/210] UI: Prepend thinking tag when template appends it to
 prompt

Makes Qwen models have a thinking block straight away during streaming.
---
 modules/chat.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index d3cd0cae..e4d5dd30 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -28,6 +28,7 @@ from modules.html_generator import (
 )
 from modules.image_utils import open_image_safely
 from modules.logging_colors import logger
+from modules.reasoning import THINKING_FORMATS
 from modules.text_generation import (
     generate_reply,
     get_encoded_length,
@@ -986,10 +987,23 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     # Add timestamp for assistant's response at the start of generation
     update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp(), model_name=shared.model_name)
 
+    # Detect if the template appended a thinking start tag to the prompt
+    thinking_prefix = None
+    if not _continue:
+        stripped_prompt = prompt.rstrip('\n')
+        for start_tag, end_tag, content_tag in THINKING_FORMATS:
+            if start_tag is not None and stripped_prompt.endswith(start_tag):
+                thinking_prefix = start_tag
+                break
+
     # Generate
     reply = None
     for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True, for_ui=for_ui)):
 
+        # Prepend thinking tag if the template appended it to the prompt
+        if thinking_prefix:
+            reply = thinking_prefix + reply
+
         # Extract the reply
         if state['mode'] in ['chat', 'chat-instruct']:
             if not _continue:

From bbd43d9463cded0aae3ce4e1ced1519693bebf9a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:54:05 -0700
Subject: [PATCH 047/210] UI: Correctly propagate truncation_length when
 ctx_size is auto

---
 modules/llama_cpp_server.py | 8 +++++++-
 modules/models.py           | 2 ++
 modules/ui_model_menu.py    | 6 +++++-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index a3e431ac..192aa9e4 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -36,6 +36,7 @@ class LlamaServer:
         self.process = None
         self.session = requests.Session()
         self.vocabulary_size = None
+        self.n_ctx = None
         self.bos_token = "<s>"
         self.last_prompt_token_count = 0
 
@@ -320,12 +321,17 @@ class LlamaServer:
                 self.vocabulary_size = model_info["meta"]["n_vocab"]
 
     def _get_bos_token(self):
-        """Get and store the model's BOS token."""
+        """Get and store the model's BOS token and context size."""
         url = f"http://127.0.0.1:{self.port}/props"
         response = self.session.get(url).json()
         if "bos_token" in response:
             self.bos_token = response["bos_token"]
 
+        # Get actual n_ctx from the server (important when --fit auto-selects it)
+        n_ctx = response.get("default_generation_settings", {}).get("n_ctx")
+        if n_ctx:
+            self.n_ctx = n_ctx
+
     def _is_port_available(self, port):
         """Check if a port is available for use."""
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
diff --git a/modules/models.py b/modules/models.py
index 48d68b0b..d83b98d7 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -54,6 +54,8 @@ def load_model(model_name, loader=None):
     if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
         if shared.args.ctx_size > 0:
             shared.settings['truncation_length'] = shared.args.ctx_size
+        elif loader == 'llama.cpp' and hasattr(model, 'n_ctx') and model.n_ctx:
+            shared.settings['truncation_length'] = model.n_ctx
 
     shared.is_multimodal = False
     if loader.lower() in ('exllamav3', 'llama.cpp') and hasattr(model, 'is_multimodal'):
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 7e91f1ce..5c83096f 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -388,7 +388,11 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
 def update_truncation_length(current_length, state):
     if 'loader' in state:
         if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp':
-            return state['ctx_size']
+            if state['ctx_size'] > 0:
+                return state['ctx_size']
+
+            # ctx_size == 0 means auto: use the actual value from the server
+            return shared.settings['truncation_length']
 
     return current_length
 

From 4f82b71ef3cfd104cf88c50a7b427063fc2c6094 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:56:35 -0700
Subject: [PATCH 048/210] UI: Bump the ctx-size max from 131072 to 262144
 (256K)

---
 modules/ui_model_menu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 5c83096f..d17f586b 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -42,7 +42,7 @@ def create_ui():
                     with gr.Row():
                         with gr.Column():
                             shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=-1, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Number of layers to offload to the GPU. -1 = auto.')
-                            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. llama.cpp: 0 = auto if gpu-layers is also -1. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
+                            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=262144, step=256, value=shared.args.ctx_size, info='Context length. llama.cpp: 0 = auto if gpu-layers is also -1. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
                             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                             shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
                             shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')

From fdd8e5b1fd92de8d0ee20e8166b88c8356d13c52 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 15:48:50 -0700
Subject: [PATCH 049/210] Make repeated Ctrl+C force a shutdown

---
 server.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/server.py b/server.py
index 73f190b6..340f7126 100644
--- a/server.py
+++ b/server.py
@@ -33,6 +33,10 @@ warnings.filterwarnings('ignore', category=UserWarning, message='Field "model_na
 
 
 def signal_handler(sig, frame):
+    # Restore the default handlers so that a second Ctrl+C (or SIGTERM) forces an immediate exit
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+    signal.signal(signal.SIGTERM, signal.SIG_DFL)
+
     logger.info("Received Ctrl+C. Shutting down Text Generation Web UI gracefully.")
 
     # Explicitly stop LlamaServer to avoid __del__ cleanup issues during shutdown

From 09d5e049d672e6bddbf6aec402a8b38cd97e074c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 16:53:49 -0700
Subject: [PATCH 050/210] UI: Improve the Tools checkbox list style

---
 css/main.css       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 js/main.js         | 17 ++++++++++
 modules/ui_chat.py |  5 +--
 3 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/css/main.css b/css/main.css
index 2d19b5c8..a7069f33 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1814,3 +1814,86 @@ tr + tr th { border-top: 1px solid; }
 
 thead + tbody tr:first-child td,
 thead + tbody tr:first-child th { border-top: 1px solid; }
+
+/* ------------------------------------------------
+   Tools CheckboxGroup - vertical DragDrop-like style
+   ------------------------------------------------ */
+
+/* "Refresh list" link in the Tools label */
+.tools-refresh-link {
+    cursor: pointer;
+}
+
+/* Checkbox list container */
+#tools-group {
+    padding: 0 !important;
+    border-width: 0 !important;
+    background: transparent !important;
+    min-height: 0 !important;
+}
+
+#tools-group .wrap {
+    display: flex;
+    flex-direction: column;
+    flex-wrap: nowrap;
+    gap: 4px;
+    padding: 0;
+    margin-top: var(--spacing-lg);
+    max-height: 350px;
+    overflow-y: auto;
+}
+
+/* Pretty scrollbar for the tools list */
+#tools-group .wrap::-webkit-scrollbar {
+    width: 8px;
+    height: 8px;
+}
+
+#tools-group .wrap::-webkit-scrollbar-track {
+    background: transparent;
+}
+
+#tools-group .wrap::-webkit-scrollbar-thumb,
+#tools-group .wrap::-webkit-scrollbar-thumb:hover {
+    background: var(--neutral-300);
+    border-radius: 30px;
+}
+
+.dark #tools-group .wrap::-webkit-scrollbar-thumb,
+.dark #tools-group .wrap::-webkit-scrollbar-thumb:hover {
+    background: rgb(255 255 255 / 6.25%);
+    border-radius: 10px;
+}
+
+#tools-group .wrap::-webkit-scrollbar-corner {
+    background: transparent;
+}
+
+/* Each checkbox item */
+#tools-group label {
+    display: flex;
+    align-items: center;
+    gap: 8px;
+    padding: 5px 8px;
+    border-radius: var(--radius-sm, 4px);
+    background: var(--block-background-fill);
+    border: 1px solid var(--border-color-primary);
+    color: var(--body-text-color);
+    font-size: var(--input-text-size);
+    font-weight: var(--input-text-weight);
+    cursor: pointer;
+    user-select: none;
+    transition: border-color 0.15s ease, background 0.15s ease;
+    box-shadow: none;
+}
+
+#tools-group label:hover {
+    border-color: var(--input-border-color-focus);
+}
+
+#tools-group label span {
+    flex: 1;
+    overflow: hidden;
+    text-overflow: ellipsis;
+    white-space: nowrap;
+}
diff --git a/js/main.js b/js/main.js
index a7dc77bb..0bc76b2a 100644
--- a/js/main.js
+++ b/js/main.js
@@ -303,6 +303,23 @@ for(i = 0; i < scrollbarElements.length; i++) {
   scrollbarElements[i].style.resize = "none";
 }
 
+
+//------------------------------------------------
+// Tools: inject "Refresh list" link into the label
+//------------------------------------------------
+const toolsTitle = document.querySelector("#tools-group > [data-testid='block-info']");
+const toolsInfo = toolsTitle ? toolsTitle.nextElementSibling : null;
+if (toolsInfo) {
+  const refreshLink = document.createElement("span");
+  refreshLink.textContent = " [Refresh list]";
+  refreshLink.className = "tools-refresh-link";
+  refreshLink.addEventListener("click", function(e) {
+    e.preventDefault();
+    document.querySelector("#tools-refresh-btn").click();
+  });
+  toolsInfo.appendChild(refreshLink);
+}
+
 //------------------------------------------------
 // Remove some backgrounds
 //------------------------------------------------
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index d5b13094..8112d956 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -92,8 +92,9 @@ def create_ui():
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 from modules.tool_use import get_available_tools
-                shared.gradio['selected_tools'] = gr.CheckboxGroup(choices=get_available_tools(), value=[], label='Tools', info='Functions the model can call during generation.')
-                ui.create_refresh_button(shared.gradio['selected_tools'], lambda: None, lambda: {'choices': get_available_tools()}, 'refresh-button')
+                shared.gradio['selected_tools'] = gr.CheckboxGroup(choices=get_available_tools(), value=[], label='Tools', info='Functions the model can call during generation.', elem_id='tools-group')
+                shared.gradio['tools_refresh'] = gr.Button('Refresh list', elem_id='tools-refresh-btn', visible=False)
+                shared.gradio['tools_refresh'].click(fn=lambda: gr.update(choices=get_available_tools()), inputs=[], outputs=[shared.gradio['selected_tools']])
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 

From 5c02b7f60369be70b52ccf0432a69380b7f5ab4a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 17:08:30 -0700
Subject: [PATCH 051/210] Allow the fetch_webpage tool to return links

---
 modules/tool_use.py   | 3 ++-
 modules/ui_chat.py    | 2 +-
 modules/web_search.py | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/modules/tool_use.py b/modules/tool_use.py
index cb1e140d..e464d5d4 100644
--- a/modules/tool_use.py
+++ b/modules/tool_use.py
@@ -4,13 +4,14 @@ import random
 
 from modules import shared
 from modules.logging_colors import logger
+from modules.utils import natural_keys
 
 
 def get_available_tools():
     """Return sorted list of tool script names from user_data/tools/*.py."""
     tools_dir = shared.user_data_dir / 'tools'
     tools_dir.mkdir(parents=True, exist_ok=True)
-    return sorted(p.stem for p in tools_dir.glob('*.py'))
+    return sorted((p.stem for p in tools_dir.glob('*.py')), key=natural_keys)
 
 
 def load_tools(selected_names):
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 8112d956..039b9af6 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -92,7 +92,7 @@ def create_ui():
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 from modules.tool_use import get_available_tools
-                shared.gradio['selected_tools'] = gr.CheckboxGroup(choices=get_available_tools(), value=[], label='Tools', info='Functions the model can call during generation.', elem_id='tools-group')
+                shared.gradio['selected_tools'] = gr.CheckboxGroup(choices=get_available_tools(), value=shared.settings.get('selected_tools', []), label='Tools', info='Functions the model can call during generation.', elem_id='tools-group')
                 shared.gradio['tools_refresh'] = gr.Button('Refresh list', elem_id='tools-refresh-btn', visible=False)
                 shared.gradio['tools_refresh'].click(fn=lambda: gr.update(choices=get_available_tools()), inputs=[], outputs=[shared.gradio['selected_tools']])
 
diff --git a/modules/web_search.py b/modules/web_search.py
index 597af4b2..b14cd042 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -18,7 +18,7 @@ def get_current_timestamp():
     return datetime.now().strftime('%b %d, %Y %H:%M')
 
 
-def download_web_page(url, timeout=10):
+def download_web_page(url, timeout=10, include_links=False):
     """
     Download a web page and convert its HTML content to structured Markdown text.
     """
@@ -35,7 +35,7 @@ def download_web_page(url, timeout=10):
         h = html2text.HTML2Text()
         h.body_width = 0
         h.ignore_images = True
-        h.ignore_links = True
+        h.ignore_links = not include_links
 
         # Convert the HTML to Markdown
         markdown_text = h.handle(response.text)

From f8936ec47c95e4147116a2d497e5b94a2d7c99ba Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 17:10:41 -0700
Subject: [PATCH 052/210] Truncate web_search and fetch_webpage tools to 8192
 tokens

---
 user_data/tools/web_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/user_data/tools/web_search.py b/user_data/tools/web_search.py
index 8923eab0..46a45a8c 100644
--- a/user_data/tools/web_search.py
+++ b/user_data/tools/web_search.py
@@ -1,4 +1,4 @@
-from modules.web_search import perform_web_search
+from modules.web_search import perform_web_search, truncate_content_by_tokens
 
 tool = {
     "type": "function",
@@ -22,6 +22,6 @@ def execute(arguments):
     output = []
     for r in results:
         if r and r["content"].strip():
-            output.append({"title": r["title"], "url": r["url"], "content": r["content"][:4000]})
+            output.append({"title": r["title"], "url": r["url"], "content": truncate_content_by_tokens(r["content"])})
 
     return output if output else [{"error": "No results found."}]

From 1b7e6c57057fdddb57c1585e10c9ad6ccf59e856 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 17:11:05 -0700
Subject: [PATCH 053/210] Add the fetch_webpage tool source

---
 user_data/tools/fetch_webpage.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 user_data/tools/fetch_webpage.py

diff --git a/user_data/tools/fetch_webpage.py b/user_data/tools/fetch_webpage.py
new file mode 100644
index 00000000..e514491c
--- /dev/null
+++ b/user_data/tools/fetch_webpage.py
@@ -0,0 +1,28 @@
+from modules.web_search import download_web_page, truncate_content_by_tokens
+
+tool = {
+    "type": "function",
+    "function": {
+        "name": "fetch_webpage",
+        "description": "Fetch and read the contents of a web page given its URL. Returns the page content as plain text.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "url": {"type": "string", "description": "The URL of the web page to fetch."},
+            },
+            "required": ["url"]
+        }
+    }
+}
+
+
+def execute(arguments):  # tool entry point; presumably called with the parsed tool-call arguments dict (TODO confirm against the tool loader)
+    url = arguments.get("url", "")
+    if not url:  # missing or empty "url": return a structured error instead of raising
+        return {"error": "No URL provided."}
+
+    content = download_web_page(url, include_links=True)  # include_links=True keeps hyperlinks in the converted page text
+    if not content or not content.strip():
+        return {"error": f"Failed to fetch content from {url}"}
+
+    return {"url": url, "content": truncate_content_by_tokens(content)}

From a09f21b9de44480d92cb788053852721806c376a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 22:17:20 -0300
Subject: [PATCH 054/210] UI: Fix tool calling for GPT-OSS and Continue

---
 extensions/openai/utils.py |  7 ++-
 modules/chat.py            | 88 +++++++++++++++++++++++++-------------
 modules/reasoning.py       |  1 +
 3 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py
index 6fb05f08..eb34ce88 100644
--- a/extensions/openai/utils.py
+++ b/extensions/openai/utils.py
@@ -120,12 +120,14 @@ def _parseChannelToolCalls(answer: str, tool_names: list[str]):
     """Parse channel-based tool calls used by GPT-OSS and similar models.
 
     Format:
+        <|start|>assistant to=functions.func_name<|channel|>commentary json<|message|>{"arg": "value"}
+    or:
         <|channel|>commentary to=functions.func_name <|constrain|>json<|message|>{"arg": "value"}
     """
     matches = []
     start_pos = None
     for m in re.finditer(
-        r'<\|channel\|>commentary to=functions\.([^<\s]+)\s*(?:<\|constrain\|>json)?<\|message\|>',
+        r'<\|channel\|>\w+ to=functions\.([^<\s]+).*?<\|message\|>',
         answer
     ):
         func_name = m.group(1).strip()
@@ -137,7 +139,8 @@ def _parseChannelToolCalls(answer: str, tool_names: list[str]):
         try:
             arguments = json.loads(json_str)
             if start_pos is None:
-                start_pos = m.start()
+                prefix = answer.rfind('<|start|>assistant', 0, m.start())
+                start_pos = prefix if prefix != -1 else m.start()
             matches.append({
                 "type": "function",
                 "function": {
diff --git a/modules/chat.py b/modules/chat.py
index e4d5dd30..19f7d07f 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -81,6 +81,13 @@ jinja_env = ImmutableSandboxedEnvironment(
 )
 jinja_env.globals["strftime_now"] = strftime_now
 
+
+def _raise_exception(message):  # registered below as the Jinja global "raise_exception"
+    raise ValueError(message)
+
+
+jinja_env.globals["raise_exception"] = _raise_exception
+
 _template_cache = {}
 
 
@@ -1048,16 +1055,23 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             yield output
 
     if _continue:
-        # Reprocess the entire internal text for extensions (like translation)
-        full_internal = output['internal'][-1][1]
-        if state['mode'] in ['chat', 'chat-instruct']:
-            full_visible = re.sub("(<USER>|<user>|{{user}})", state['name1'], full_internal)
-        else:
-            full_visible = full_internal
+        # Reprocess the entire internal text for extensions (like translation).
+        # Skip the rebuild when the visible text contains <tool_call> markers,
+        # since those only exist in visible (internal is cleared after each tool
+        # execution) and rebuilding from internal would destroy them.
+        if '<tool_call>' not in output['visible'][-1][1]:
+            full_internal = output['internal'][-1][1]
+            if state['mode'] in ['chat', 'chat-instruct']:
+                full_visible = re.sub("(<USER>|<user>|{{user}})", state['name1'], full_internal)
+            else:
+                full_visible = full_internal
 
-        full_visible = html.escape(full_visible)
-        if not state.get('_skip_output_extensions'):
-            output['visible'][-1][1] = apply_extensions('output', full_visible, state, is_chat=True)
+            full_visible = html.escape(full_visible)
+            if not state.get('_skip_output_extensions'):
+                output['visible'][-1][1] = apply_extensions('output', full_visible, state, is_chat=True)
+        else:
+            if not state.get('_skip_output_extensions'):
+                output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
     else:
         if not state.get('_skip_output_extensions'):
             output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
@@ -1222,6 +1236,18 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         # so we can extract thinking content from just this turn's output.
         _model_visible = history['visible'][-1][1]
 
+        # Recover visible_prefix from existing visible text (e.g. on Continue
+        # after a previous session had tool calls). Extract all <tool_call>
+        # blocks and any text between them (thinking blocks, intermediate text).
+        if not visible_prefix and _model_visible:
+            tc_matches = list(re.finditer(r'<tool_call>.*?</tool_call>', _model_visible, re.DOTALL))
+            if tc_matches:
+                prefix_end = tc_matches[-1].end()
+                prefix = _model_visible[:prefix_end].strip()
+                if prefix:
+                    visible_prefix = [prefix]
+                _model_visible = _model_visible[prefix_end:].strip()
+
         # Re-apply visible prefix to the final state after streaming completes.
         # This is safe because we're no longer sharing the object with chatbot_wrapper.
         if visible_prefix:
@@ -1244,20 +1270,35 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         meta = history.get('metadata', {})
         seq = meta.setdefault(f'assistant_{row_idx}', {}).setdefault('tool_sequence', [])
 
-        # Serialize tool calls
+        def _render():
+            return chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
+
+        # Serialize tool calls and build display headers in one pass
         serialized = []
+        tc_headers = []
         for tc in parsed_calls:
             tc['id'] = generate_tool_call_id()
-            args = tc['function'].get('arguments', {})
+            fn_name = tc['function']['name']
+            fn_args = tc['function'].get('arguments', {})
+
             serialized.append({
                 'id': tc['id'],
                 'type': 'function',
                 'function': {
-                    'name': tc['function']['name'],
-                    'arguments': json.dumps(args) if isinstance(args, dict) else args
+                    'name': fn_name,
+                    'arguments': json.dumps(fn_args) if isinstance(fn_args, dict) else fn_args
                 }
             })
 
+            if isinstance(fn_args, dict) and fn_args:
+                args_summary = ', '.join(f'{k}={json.dumps(v, ensure_ascii=False)}' for k, v in fn_args.items())
+            elif isinstance(fn_args, dict):
+                args_summary = ''
+            else:
+                args_summary = str(fn_args)
+
+            tc_headers.append(f'{fn_name}({args_summary})')
+
         seq.append({'tool_calls': serialized})
 
         # Clear internal (raw tool markup)
@@ -1274,24 +1315,11 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         if intermediate and intermediate.strip():
             visible_prefix.append(intermediate.strip())
 
-        # Build args summaries and show placeholder accordions with "..."
-        # before execution starts (tool calls may be slow, e.g. web search).
-        tc_headers = []
-        for tc in parsed_calls:
-            fn_name = tc['function']['name']
-            fn_args = tc['function'].get('arguments', {})
-            if isinstance(fn_args, dict) and fn_args:
-                args_summary = ', '.join(f'{k}={json.dumps(v, ensure_ascii=False)}' for k, v in fn_args.items())
-            elif isinstance(fn_args, dict):
-                args_summary = ''
-            else:
-                args_summary = str(fn_args)
-
-            tc_headers.append(f'{fn_name}({args_summary})')
-
+        # Show placeholder accordions with "..." before execution starts
+        # (tool calls may be slow, e.g. web search).
         pending_placeholders = [f'<tool_call>{h}\n...\n</tool_call>' for h in tc_headers]
         history['visible'][-1][1] = '\n\n'.join(visible_prefix + pending_placeholders)
-        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
+        yield _render(), history
 
         # Execute tools, store results, and replace placeholders with real results
         for i, tc in enumerate(parsed_calls):
@@ -1308,7 +1336,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
             # Replace the placeholder with the real result
             pending_placeholders[i] = f'<tool_call>{tc_headers[i]}\n{pretty_result}\n</tool_call>'
             history['visible'][-1][1] = '\n\n'.join(visible_prefix + pending_placeholders)
-            yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
+            yield _render(), history
 
         # Move completed tool calls into visible_prefix for next turns
         visible_prefix.extend(pending_placeholders)
diff --git a/modules/reasoning.py b/modules/reasoning.py
index 12f8553d..708ee55a 100644
--- a/modules/reasoning.py
+++ b/modules/reasoning.py
@@ -5,6 +5,7 @@ import html as html_module
 THINKING_FORMATS = [
     ('<think>', '</think>', None),
     ('<|channel|>analysis<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
+    ('<|channel|>commentary<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
     ('<seed:think>', '</seed:think>', None),
     ('<|think|>', '<|end|>', '<|content|>'),  # Solar Open
     ('Thinking Process:', '</think>', None),  # Qwen3.5 verbose thinking outside tags

From 4c7a56c18df369d5d04870d6280468900662d3cd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 22:17:23 -0300
Subject: [PATCH 055/210] Add num_pages and max_tokens kwargs to web search
 tools

---
 user_data/tools/fetch_webpage.py | 4 +++-
 user_data/tools/web_search.py    | 8 ++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/user_data/tools/fetch_webpage.py b/user_data/tools/fetch_webpage.py
index e514491c..ca3e7331 100644
--- a/user_data/tools/fetch_webpage.py
+++ b/user_data/tools/fetch_webpage.py
@@ -9,6 +9,7 @@ tool = {
             "type": "object",
             "properties": {
                 "url": {"type": "string", "description": "The URL of the web page to fetch."},
+                "max_tokens": {"type": "integer", "description": "Maximum number of tokens in the returned content (default: 2048)."},
             },
             "required": ["url"]
         }
@@ -18,6 +19,7 @@ tool = {
 
 def execute(arguments):
     url = arguments.get("url", "")
+    max_tokens = arguments.get("max_tokens", 2048)
     if not url:
         return {"error": "No URL provided."}
 
@@ -25,4 +27,4 @@ def execute(arguments):
     if not content or not content.strip():
         return {"error": f"Failed to fetch content from {url}"}
 
-    return {"url": url, "content": truncate_content_by_tokens(content)}
+    return {"url": url, "content": truncate_content_by_tokens(content, max_tokens=max_tokens)}
diff --git a/user_data/tools/web_search.py b/user_data/tools/web_search.py
index 46a45a8c..80845963 100644
--- a/user_data/tools/web_search.py
+++ b/user_data/tools/web_search.py
@@ -9,6 +9,8 @@ tool = {
             "type": "object",
             "properties": {
                 "query": {"type": "string", "description": "The search query."},
+                "num_pages": {"type": "integer", "description": "Number of search result pages to fetch (default: 3)."},
+                "max_tokens": {"type": "integer", "description": "Maximum number of tokens per page result (default: 2048)."},
             },
             "required": ["query"]
         }
@@ -18,10 +20,19 @@ tool = {
 
 def execute(arguments):
+    """Run a web search for arguments["query"] and return the page results.
+
+    "num_pages" (default 3) is forwarded to perform_web_search and
+    "max_tokens" (default 2048) to truncate_content_by_tokens for each
+    result. Results that are falsy or whose content is blank are skipped.
+    Returns a list of {"title", "url", "content"} dicts, or a one-element
+    list holding an error dict when no result is kept.
+    """
     query = arguments.get("query", "")
-    results = perform_web_search(query, num_pages=3)
+    num_pages = arguments.get("num_pages", 3)
+    max_tokens = arguments.get("max_tokens", 2048)
+    results = perform_web_search(query, num_pages=num_pages)
     output = []
     for r in results:
         if r and r["content"].strip():
-            output.append({"title": r["title"], "url": r["url"], "content": truncate_content_by_tokens(r["content"])})
+            output.append({"title": r["title"], "url": r["url"], "content": truncate_content_by_tokens(r["content"], max_tokens=max_tokens)})
 
     return output if output else [{"error": "No results found."}]

From 286ae475f6794cfb39bf9dabdd289b52eafd5c5b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 22:39:38 -0300
Subject: [PATCH 056/210] UI: Clean up tool calling code

---
 modules/chat.py     | 36 ++++++++++++++++++------------------
 modules/tool_use.py |  7 -------
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 19f7d07f..1649a6e8 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1056,9 +1056,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
     if _continue:
         # Reprocess the entire internal text for extensions (like translation).
-        # Skip the rebuild when the visible text contains <tool_call> markers,
+        # Skip entirely when the visible text contains <tool_call> markers,
         # since those only exist in visible (internal is cleared after each tool
-        # execution) and rebuilding from internal would destroy them.
+        # execution) and rebuilding from internal would destroy them. Output
+        # extensions also can't handle the raw <tool_call> markup safely.
         if '<tool_call>' not in output['visible'][-1][1]:
             full_internal = output['internal'][-1][1]
             if state['mode'] in ['chat', 'chat-instruct']:
@@ -1069,9 +1070,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             full_visible = html.escape(full_visible)
             if not state.get('_skip_output_extensions'):
                 output['visible'][-1][1] = apply_extensions('output', full_visible, state, is_chat=True)
-        else:
-            if not state.get('_skip_output_extensions'):
-                output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
     else:
         if not state.get('_skip_output_extensions'):
             output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
@@ -1141,16 +1139,6 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
     if not character_is_loaded(state):
         return
 
-    # On regenerate, clear old tool_sequence metadata so it gets rebuilt.
-    # Save it first so it can be stored per-version below.
-    _old_tool_sequence = None
-    if regenerate:
-        history = state['history']
-        meta = history.get('metadata', {})
-        row_idx = len(history['internal']) - 1
-        if row_idx >= 0:
-            _old_tool_sequence = meta.get(f'assistant_{row_idx}', {}).pop('tool_sequence', None)
-
     if state['start_with'] != '' and not _continue:
         if regenerate:
             text, state['history'] = remove_last_message(state['history'])
@@ -1160,13 +1148,25 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         send_dummy_message(text, state)
         send_dummy_reply(state['start_with'], state)
 
+    # On regenerate, clear old tool_sequence metadata so it gets rebuilt.
+    # Save it first so it can be stored per-version below.
+    # This must happen after the start_with logic above, which may remove
+    # and re-add messages, changing which row we operate on.
+    _old_tool_sequence = None
+    if regenerate:
+        history = state['history']
+        meta = history.get('metadata', {})
+        row_idx = len(history['internal']) - 1
+        if row_idx >= 0:
+            _old_tool_sequence = meta.get(f'assistant_{row_idx}', {}).pop('tool_sequence', None)
+
     # Load tools if any are selected
     selected = state.get('selected_tools', [])
     parseToolCall = None
     if selected:
-        from modules.tool_use import load_tools, execute_tool, generate_tool_call_id
+        from modules.tool_use import load_tools, execute_tool
         try:
-            from extensions.openai.utils import parseToolCall
+            from extensions.openai.utils import parseToolCall, getToolCallId
         except ImportError:
             logger.warning('Tool calling requires the openai extension for parseToolCall. Disabling tools.')
             selected = []
@@ -1277,7 +1277,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         serialized = []
         tc_headers = []
         for tc in parsed_calls:
-            tc['id'] = generate_tool_call_id()
+            tc['id'] = getToolCallId()
             fn_name = tc['function']['name']
             fn_args = tc['function'].get('arguments', {})
 
diff --git a/modules/tool_use.py b/modules/tool_use.py
index e464d5d4..d65c14ed 100644
--- a/modules/tool_use.py
+++ b/modules/tool_use.py
@@ -1,6 +1,5 @@
 import importlib.util
 import json
-import random
 
 from modules import shared
 from modules.logging_colors import logger
@@ -49,12 +48,6 @@ def load_tools(selected_names):
     return tool_defs, executors
 
 
-def generate_tool_call_id():
-    """Generate a unique tool call ID (e.g. 'call_a1b2c3d4')."""
-    chars = "abcdefghijklmnopqrstuvwxyz0123456789"
-    return "call_" + "".join(random.choice(chars) for _ in range(8))
-
-
 def execute_tool(func_name, arguments, executors):
     """Execute a tool by function name. Returns result as a JSON string."""
     fn = executors.get(func_name)

From 1ed56aee85b4c64232f3f6517a8b555140f1c8a8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 18:45:19 -0700
Subject: [PATCH 057/210] Add a calculate tool

---
 user_data/tools/calculate.py | 48 ++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 user_data/tools/calculate.py

diff --git a/user_data/tools/calculate.py b/user_data/tools/calculate.py
new file mode 100644
index 00000000..e88b71a3
--- /dev/null
+++ b/user_data/tools/calculate.py
@@ -0,0 +1,62 @@
+import ast
+import operator
+
+# Whitelist of supported operators: maps each allowed AST operator node type
+# to the function that implements it. _eval() rejects anything not listed.
+OPERATORS = {
+    ast.Add: operator.add,
+    ast.Sub: operator.sub,
+    ast.Mult: operator.mul,
+    ast.Div: operator.truediv,
+    ast.Pow: operator.pow,
+    ast.Mod: operator.mod,
+    ast.USub: operator.neg,
+}
+
+
+def _eval(node):
+    """Recursively evaluate one node of a parsed arithmetic expression.
+
+    Handles int/float constants plus binary and unary operations whose
+    operator type is a key of OPERATORS; raises ValueError for anything else.
+    """
+    if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
+        return node.value
+    elif isinstance(node, ast.BinOp) and type(node.op) in OPERATORS:
+        return OPERATORS[type(node.op)](_eval(node.left), _eval(node.right))
+    elif isinstance(node, ast.UnaryOp) and type(node.op) in OPERATORS:
+        return OPERATORS[type(node.op)](_eval(node.operand))
+    raise ValueError(f"Unsupported expression")
+
+
+# Tool definition: function name, description and JSON-schema parameters.
+tool = {
+    "type": "function",
+    "function": {
+        "name": "calculate",
+        "description": "Evaluate a math expression. Supports +, -, *, /, **, %.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "expression": {"type": "string", "description": "The math expression to evaluate (e.g. '2 * (3 + 4)')."},
+            },
+            "required": ["expression"]
+        }
+    }
+}
+
+
+def execute(arguments):
+    """Evaluate arguments["expression"] and return the outcome as a dict.
+
+    Returns {"expression": ..., "result": ...} on success. Any exception
+    raised while parsing or evaluating is returned as {"error": <message>}.
+    """
+    expr = arguments.get("expression", "")
+    try:
+        # mode='eval' makes ast.parse accept a single expression only
+        tree = ast.parse(expr, mode='eval')
+        result = _eval(tree.body)
+        return {"expression": expr, "result": result}
+    except Exception as e:
+        return {"error": str(e)}

From 0e35421593ebed4295116db35b323ac2e32c91a8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 18:52:41 -0700
Subject: [PATCH 058/210] API: Always extract reasoning_content, even with tool
 calls

---
 extensions/openai/completions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 0eb0cd27..290a5bc0 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -554,7 +554,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         else:
             yield chunk
     else:
-        reasoning, content = extract_reasoning(answer) if not tool_calls else (None, answer)
+        reasoning, content = extract_reasoning(answer)
         message = {
             "role": "assistant",
             "refusal": None,

From 58f26a4cc7d0cec9cc22e4b32bc605540bcdef4f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 19:18:55 -0700
Subject: [PATCH 059/210] UI: Skip redundant work in chat loop when no tools
 are selected

---
 modules/chat.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 1649a6e8..c93972fb 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1239,7 +1239,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         # Recover visible_prefix from existing visible text (e.g. on Continue
         # after a previous session had tool calls). Extract all <tool_call>
         # blocks and any text between them (thinking blocks, intermediate text).
-        if not visible_prefix and _model_visible:
+        if tool_func_names and not visible_prefix and _model_visible:
             tc_matches = list(re.finditer(r'<tool_call>.*?</tool_call>', _model_visible, re.DOTALL))
             if tc_matches:
                 prefix_end = tc_matches[-1].end()
@@ -1253,7 +1253,8 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         if visible_prefix:
             history['visible'][-1][1] = '\n\n'.join(visible_prefix + [_model_visible])
 
-        save_history(history, state['unique_id'], state['character_menu'], state['mode'])
+        if tool_func_names:
+            save_history(history, state['unique_id'], state['character_menu'], state['mode'])
 
         # Check for tool calls
         if not tool_func_names or shared.stop_everything:

From 04213dff143cc5a79aaa92fe3ba76271925833ee Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 19:55:02 -0700
Subject: [PATCH 060/210] Address copilot feedback

---
 modules/llama_cpp_server.py  | 2 +-
 modules/models_settings.py   | 3 ++-
 modules/text_generation.py   | 2 +-
 modules/tool_use.py          | 3 +++
 user_data/tools/calculate.py | 6 +++++-
 user_data/tools/roll_dice.py | 4 ++--
 6 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 192aa9e4..1425844d 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -136,7 +136,7 @@ class LlamaServer:
 
         logit_bias = []
         if state['custom_token_bans']:
-            logit_bias.extend([[int(token_id), False] for token_id in state['custom_token_bans'].split(',')])
+            logit_bias.extend([[int(token_id.strip()), False] for token_id in state['custom_token_bans'].split(',') if token_id.strip()])
 
         if state.get('logit_bias'):
             for token_id_str, bias in state['logit_bias'].items():
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 5e69b60e..0e117176 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -431,7 +431,8 @@ def load_instruction_template(template):
     else:
         return ''
 
-    file_contents = open(filepath, 'r', encoding='utf-8').read()
+    with open(filepath, 'r', encoding='utf-8') as f:
+        file_contents = f.read()
     data = yaml.safe_load(file_contents)
     if 'instruction_template' in data:
         return data['instruction_template']
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 787c1814..d487cd2f 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -378,7 +378,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
         generate_params['sampler_priority'] = [x.strip() for x in state['sampler_priority'].replace('\n', ',').split(',') if x.strip()]
 
     if state['custom_token_bans']:
-        to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
+        to_ban = [int(x.strip()) for x in state['custom_token_bans'].split(',') if x.strip()]
         if len(to_ban) > 0:
             if generate_params.get('suppress_tokens', None):
                 generate_params['suppress_tokens'] += to_ban
diff --git a/modules/tool_use.py b/modules/tool_use.py
index d65c14ed..55424853 100644
--- a/modules/tool_use.py
+++ b/modules/tool_use.py
@@ -42,6 +42,9 @@ def load_tools(selected_names):
             continue
 
         func_name = tool_def.get('function', {}).get('name', name)
+        if func_name in executors:
+            logger.warning(f'Tool "{name}" declares function name "{func_name}" which conflicts with an already loaded tool. Skipping.')
+            continue
         tool_defs.append(tool_def)
         executors[func_name] = execute_fn
 
diff --git a/user_data/tools/calculate.py b/user_data/tools/calculate.py
index e88b71a3..94f74c41 100644
--- a/user_data/tools/calculate.py
+++ b/user_data/tools/calculate.py
@@ -16,7 +16,11 @@ def _eval(node):
     if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
         return node.value
     elif isinstance(node, ast.BinOp) and type(node.op) in OPERATORS:
-        return OPERATORS[type(node.op)](_eval(node.left), _eval(node.right))
+        left = _eval(node.left)
+        right = _eval(node.right)
+        if isinstance(node.op, ast.Pow) and isinstance(right, (int, float)) and abs(right) > 10000:
+            raise ValueError("Exponent too large (max 10000)")
+        return OPERATORS[type(node.op)](left, right)
     elif isinstance(node, ast.UnaryOp) and type(node.op) in OPERATORS:
         return OPERATORS[type(node.op)](_eval(node.operand))
     raise ValueError(f"Unsupported expression")
diff --git a/user_data/tools/roll_dice.py b/user_data/tools/roll_dice.py
index 9cab48a8..4af38ddc 100644
--- a/user_data/tools/roll_dice.py
+++ b/user_data/tools/roll_dice.py
@@ -17,7 +17,14 @@ tool = {
 
 
 def execute(arguments):
-    count = arguments.get("count", 1)
-    sides = arguments.get("sides", 20)
+    """Roll "count" dice with "sides" faces; return each roll and the total.
+
+    "count" (default 1) is clamped to 1..1000 and "sides" (default 20) to
+    2..1000. Both are passed through int() first, since a JSON argument may
+    arrive as a float (2.0) or a numeric string ("2"), which would
+    otherwise raise TypeError in the clamping or in range().
+    """
+    count = max(1, min(int(arguments.get("count", 1)), 1000))
+    sides = max(2, min(int(arguments.get("sides", 20)), 1000))
     rolls = [random.randint(1, sides) for _ in range(count)]
     return {"rolls": rolls, "total": sum(rolls)}

From 85ec85e569df750724551bf10d5970ee8e7eb351 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 20:21:01 -0700
Subject: [PATCH 061/210] UI: Fix Continue while in a tool-calling loop, remove
 the upper limit on number of tool calls

---
 modules/chat.py | 84 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 59 insertions(+), 25 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index c93972fb..b93f2b9d 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -161,6 +161,64 @@ def _deserialize_tool_call_arguments(tool_calls):
     return result
 
 
+def _expand_tool_sequence(tool_seq):
+    """Expand a tool_sequence list into API messages.
+
+    Returns a list of dicts (role: assistant with tool_calls, or role: tool).
+    If any tool_call IDs are missing a matching tool result, a synthetic
+    empty result is inserted so the prompt is never malformed. Synthetic
+    results are placed directly after the real results of the assistant
+    message that issued the call (i.e. before the next assistant message,
+    or at the end), so a result never drifts behind a later tool turn.
+    """
+    messages = []
+
+    # IDs that have a real tool result anywhere in the sequence
+    answered_ids = {
+        item.get('tool_call_id', '')
+        for item in tool_seq
+        if 'tool_calls' not in item and item.get('role') == 'tool'
+    }
+
+    # Unanswered IDs of the assistant message currently being expanded
+    orphan_ids = []
+
+    def _flush_orphans():
+        for tc_id in orphan_ids:
+            messages.append({
+                "role": "tool",
+                "content": "",
+                "tool_call_id": tc_id
+            })
+        orphan_ids.clear()
+
+    for item in tool_seq:
+        if 'tool_calls' in item:
+            # Close out the previous assistant message before starting a new one
+            _flush_orphans()
+            deserialized = _deserialize_tool_call_arguments(item['tool_calls'])
+            messages.append({
+                "role": "assistant",
+                "content": "",
+                "tool_calls": deserialized
+            })
+            for tc in item['tool_calls']:
+                tc_id = tc.get('id', '')
+                if tc_id and tc_id not in answered_ids:
+                    orphan_ids.append(tc_id)
+        elif item.get('role') == 'tool':
+            messages.append({
+                "role": "tool",
+                "content": item['content'],
+                "tool_call_id": item.get('tool_call_id', '')
+            })
+
+    # Orphans of the last assistant message
+    _flush_orphans()
+
+    return messages
+
+
 def generate_chat_prompt(user_input, state, **kwargs):
     impersonate = kwargs.get('impersonate', False)
     _continue = kwargs.get('_continue', False)
@@ -312,17 +355,8 @@ def generate_chat_prompt(user_input, state, **kwargs):
         meta_key = f"assistant_{row_idx}"
         tool_seq = metadata.get(meta_key, {}).get('tool_sequence', [])
         if tool_seq:
-            for item in reversed(tool_seq):
-                if 'tool_calls' in item:
-                    messages.insert(insert_pos, {
-                        "role": "assistant", "content": "",
-                        "tool_calls": _deserialize_tool_call_arguments(item['tool_calls'])
-                    })
-                elif item.get('role') == 'tool':
-                    messages.insert(insert_pos, {
-                        "role": "tool", "content": item['content'],
-                        "tool_call_id": item.get('tool_call_id', '')
-                    })
+            for msg in reversed(_expand_tool_sequence(tool_seq)):
+                messages.insert(insert_pos, msg)
 
         if entry_meta.get('role') == 'system':
             if user_msg:
@@ -400,17 +434,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
         # history loop during regenerate — needed so the model sees prior
         # tool calls and results when re-generating the final answer).
         current_tool_seq = metadata.get(f"assistant_{len(history)}", {}).get('tool_sequence', [])
-        for item in current_tool_seq:
-            if 'tool_calls' in item:
-                messages.append({
-                    "role": "assistant", "content": "",
-                    "tool_calls": _deserialize_tool_call_arguments(item['tool_calls'])
-                })
-            elif item.get('role') == 'tool':
-                messages.append({
-                    "role": "tool", "content": item['content'],
-                    "tool_call_id": item.get('tool_call_id', '')
-                })
+        messages.extend(_expand_tool_sequence(current_tool_seq))
 
     if impersonate and state['mode'] != 'chat-instruct':
         messages.append({"role": "user", "content": "fake user message replace me"})
@@ -1181,9 +1205,8 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
     visible_prefix = []  # Accumulated tool call summaries + results
     last_save_time = time.monotonic()
     save_interval = 8
-    max_tool_turns = 10
-
-    for _tool_turn in range(max_tool_turns):
+    _tool_turn = 0
+    while True:
         history = state['history']
 
         # Turn 0: use original flags; turns 2+: regenerate into the same entry.
@@ -1324,6 +1347,16 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
 
         # Execute tools, store results, and replace placeholders with real results
         for i, tc in enumerate(parsed_calls):
+            # Check for stop request before each tool execution
+            if shared.stop_everything:
+                for j in range(i, len(parsed_calls)):
+                    seq.append({'role': 'tool', 'content': 'Tool execution was cancelled by the user.', 'tool_call_id': parsed_calls[j]['id']})
+                    pending_placeholders[j] = f'<tool_call>{tc_headers[j]}\nCancelled\n</tool_call>'
+
+                history['visible'][-1][1] = '\n\n'.join(visible_prefix + pending_placeholders)
+                yield _render(), history
+                break
+
             fn_name = tc['function']['name']
             fn_args = tc['function'].get('arguments', {})
             result = execute_tool(fn_name, fn_args, tool_executors)
@@ -1345,6 +1378,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         save_history(history, state['unique_id'], state['character_menu'], state['mode'])
 
         state['history'] = history
+        _tool_turn += 1
 
     state.pop('_tool_turn', None)
     state['history'] = history

From c094bc943c5912d592b0f796b75989af4fc82dc4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 12 Mar 2026 21:45:38 -0700
Subject: [PATCH 062/210] UI: Skip output extensions on intermediate
 tool-calling turns

---
 modules/chat.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index b93f2b9d..57fd50e0 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1214,6 +1214,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         # that intermediate tool-loop regenerations don't pollute swipe history.
         if _tool_turn > 0:
             state['_tool_turn'] = True
+            state['_skip_output_extensions'] = True
 
         regen = regenerate if _tool_turn == 0 else True
         cont = _continue if _tool_turn == 0 else False
@@ -1381,6 +1382,18 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         _tool_turn += 1
 
     state.pop('_tool_turn', None)
+
+    # If output extensions were deferred during tool turns, apply them now
+    # to the final model response only (not to tool call markers).
+    if state.pop('_skip_output_extensions', None):
+        _model_visible = apply_extensions('output', _model_visible, state, is_chat=True)
+        if visible_prefix:
+            history['visible'][-1][1] = '\n\n'.join(visible_prefix + [_model_visible])
+        else:
+            history['visible'][-1][1] = _model_visible
+
+        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
+
     state['history'] = history
 
     # Sync version metadata so swipes show the full visible (with tool prefix)

From 5ddc1002d2f3d6cc8f65aad959dc810e92530319 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 02:40:09 -0700
Subject: [PATCH 063/210] Update ExLlamaV3 to 0.0.25

---
 requirements/full/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 12e7fbae..8e095fae 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -42,7 +42,7 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.24/exllamav3-0.0.24+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.24/exllamav3-0.0.24+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"

From a4bef860b6d743c78d0aed36c934cc022df37bee Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 06:45:29 -0300
Subject: [PATCH 064/210] UI: Optimize chat streaming by batching morphdom to
 one update per animation frame

The monitor physically cannot paint faster than its refresh rate, so
intermediate morphdom calls between frames do redundant parsing, diffing,
and patching work that is never displayed.
---
 js/global_scope_js.js | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 62b31d37..3207a681 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -269,7 +269,21 @@ function removeLastClick() {
   document.getElementById("Remove-last").click();
 }
 
+let pendingMorphdomData = null;
+let morphdomRafId = null;
+
 function handleMorphdomUpdate(data) {
+  pendingMorphdomData = data;
+  if (!morphdomRafId) {
+    morphdomRafId = requestAnimationFrame(() => {
+      morphdomRafId = null;
+      applyMorphdomUpdate(pendingMorphdomData);
+      pendingMorphdomData = null;
+    });
+  }
+}
+
+function applyMorphdomUpdate(data) {
   // Determine target element and use it as query scope
   var target_element, target_html;
   if (data.last_message_only) {

From 5833d94d7fad6739ef72d3c169143fbc977211c2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 02:56:49 -0700
Subject: [PATCH 065/210] UI: Prevent word breaks in tables

---
 css/main.css | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/css/main.css b/css/main.css
index a7069f33..30c0d5f5 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1802,6 +1802,16 @@ table {
     border-collapse: collapse;
 }
 
+.message-body table {
+    display: block;
+    overflow-x: auto;
+}
+
+.message-body :is(td, th) {
+    word-break: normal;
+    overflow-wrap: normal;
+}
+
 table, tr, td, th, thead {
     border: 0;
 }

From fef95b9e56855707cc13e5dc1220193352bc4bc1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 03:05:09 -0700
Subject: [PATCH 066/210] UI: Fix an autoscroll race condition during chat
 streaming

---
 js/main.js | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/js/main.js b/js/main.js
index 0bc76b2a..135aa948 100644
--- a/js/main.js
+++ b/js/main.js
@@ -145,6 +145,7 @@ targetElement.classList.add("pretty_scrollbar");
 targetElement.classList.add("chat-parent");
 window.isScrolled = false;
 let scrollTimeout;
+let isProgrammaticScroll = false;
 
 targetElement.addEventListener("scroll", function() {
   let diff = targetElement.scrollHeight - targetElement.clientHeight;
@@ -157,9 +158,10 @@ targetElement.addEventListener("scroll", function() {
 
   if(isAtBottomNow) {
     window.isScrolled = false;
-  } else {
+  } else if (!isProgrammaticScroll) {
     window.isScrolled = true;
   }
+  isProgrammaticScroll = false;
 
   // Clear previous timeout and set new one
   clearTimeout(scrollTimeout);
@@ -193,6 +195,7 @@ const observer = new MutationObserver(function(mutations) {
   if (!window.isScrolled && !isScrollingClassOnly) {
     const maxScroll = targetElement.scrollHeight - targetElement.clientHeight;
     if (maxScroll > 0 && targetElement.scrollTop < maxScroll - 1) {
+      isProgrammaticScroll = true;
       targetElement.scrollTop = maxScroll;
     }
   }
@@ -1091,6 +1094,7 @@ document.fonts.addEventListener("loadingdone", (event) => {
     if (!window.isScrolled) {
       const maxScroll = targetElement.scrollHeight - targetElement.clientHeight;
       if (targetElement.scrollTop < maxScroll - 5) {
+        isProgrammaticScroll = true;
         targetElement.scrollTop = maxScroll;
       }
     }

From 46288256511c0e42fb9fb4cfeb8585a2a9362947 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 03:17:36 -0700
Subject: [PATCH 067/210] Better solution to
 fef95b9e56855707cc13e5dc1220193352bc4bc1

---
 js/main.js | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/js/main.js b/js/main.js
index 135aa948..c3e51c3c 100644
--- a/js/main.js
+++ b/js/main.js
@@ -145,7 +145,8 @@ targetElement.classList.add("pretty_scrollbar");
 targetElement.classList.add("chat-parent");
 window.isScrolled = false;
 let scrollTimeout;
-let isProgrammaticScroll = false;
+let lastScrollTop = 0;
+let lastScrollHeight = 0;
 
 targetElement.addEventListener("scroll", function() {
   let diff = targetElement.scrollHeight - targetElement.clientHeight;
@@ -158,10 +159,11 @@ targetElement.addEventListener("scroll", function() {
 
   if(isAtBottomNow) {
     window.isScrolled = false;
-  } else if (!isProgrammaticScroll) {
+  } else if (targetElement.scrollTop < lastScrollTop && targetElement.scrollHeight >= lastScrollHeight) {
     window.isScrolled = true;
   }
-  isProgrammaticScroll = false;
+  lastScrollTop = targetElement.scrollTop;
+  lastScrollHeight = targetElement.scrollHeight;
 
   // Clear previous timeout and set new one
   clearTimeout(scrollTimeout);
@@ -195,7 +197,6 @@ const observer = new MutationObserver(function(mutations) {
   if (!window.isScrolled && !isScrollingClassOnly) {
     const maxScroll = targetElement.scrollHeight - targetElement.clientHeight;
     if (maxScroll > 0 && targetElement.scrollTop < maxScroll - 1) {
-      isProgrammaticScroll = true;
       targetElement.scrollTop = maxScroll;
     }
   }
@@ -1094,7 +1095,6 @@ document.fonts.addEventListener("loadingdone", (event) => {
     if (!window.isScrolled) {
       const maxScroll = targetElement.scrollHeight - targetElement.clientHeight;
       if (targetElement.scrollTop < maxScroll - 5) {
-        isProgrammaticScroll = true;
         targetElement.scrollTop = maxScroll;
       }
     }

From c39c187f47b79b540b8b0abf18979b5103f48bd0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 03:21:47 -0700
Subject: [PATCH 068/210] UI: Improve the style of table scrollbars

---
 css/main.css              | 3 +--
 modules/html_generator.py | 3 +++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/css/main.css b/css/main.css
index 30c0d5f5..7cc496a7 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1802,8 +1802,7 @@ table {
     border-collapse: collapse;
 }
 
-.message-body table {
-    display: block;
+.table-wrapper {
     overflow-x: auto;
 }
 
diff --git a/modules/html_generator.py b/modules/html_generator.py
index f3811602..138d4ade 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -317,6 +317,9 @@ def process_markdown_content(string):
     # Unescape backslashes
     html_output = html_output.replace('\\\\', '\\')
 
+    # Wrap tables in a scrollable div
+    html_output = html_output.replace('<table>', '<div class="table-wrapper pretty_scrollbar"><table>').replace('</table>', '</table></div>')
+
     return html_output
 
 

From d0b72c73c08cc74c7d05c5afdbbef7f96b94dcfd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 03:43:02 -0700
Subject: [PATCH 069/210] Update diffusers to 0.37

---
 requirements/full/requirements.txt               | 2 +-
 requirements/full/requirements_amd.txt           | 2 +-
 requirements/full/requirements_apple_intel.txt   | 2 +-
 requirements/full/requirements_apple_silicon.txt | 2 +-
 requirements/full/requirements_cpu_only.txt      | 2 +-
 requirements/full/requirements_nowheels.txt      | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 8e095fae..03f4abac 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -2,7 +2,7 @@ accelerate==1.12.*
 audioop-lts<1.0; python_version >= "3.13"
 bitsandbytes==0.49.*
 datasets
-diffusers==0.36.*
+diffusers==0.37.*
 einops
 fastapi==0.112.4
 flash-linear-attention==0.4.*
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 19cc0d9d..f3551fa2 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -1,7 +1,7 @@
 accelerate==1.12.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
-diffusers==0.36.*
+diffusers==0.37.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ebe26f9d..5e0cf8ad 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -1,7 +1,7 @@
 accelerate==1.12.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
-diffusers==0.36.*
+diffusers==0.37.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 49155690..d55c3e24 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -1,7 +1,7 @@
 accelerate==1.12.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
-diffusers==0.36.*
+diffusers==0.37.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 1c7c5735..34e864ac 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -1,7 +1,7 @@
 accelerate==1.12.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
-diffusers==0.36.*
+diffusers==0.37.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 63823db8..6128c0ed 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -1,7 +1,7 @@
 accelerate==1.12.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
-diffusers==0.36.*
+diffusers==0.37.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15

From b7670cc762206445e4e0ffed76f535c661bb85f4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 04:00:30 -0700
Subject: [PATCH 070/210] Add a tool calling tutorial

---
 docs/Tool Calling Tutorial.md | 144 ++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 docs/Tool Calling Tutorial.md

diff --git a/docs/Tool Calling Tutorial.md b/docs/Tool Calling Tutorial.md
new file mode 100644
index 00000000..170bdff7
--- /dev/null
+++ b/docs/Tool Calling Tutorial.md	
@@ -0,0 +1,144 @@
+## Tool calling in the UI
+
+### 1. Load a model with tool-calling support
+
+Load a model with tool-calling support (Qwen, Mistral, Llama 4, etc.) from the Model tab.
+
+### 2. Select tools
+
+In the chat sidebar, check the tools you want the model to use:
+
+- **web_search** -- Search the web using DuckDuckGo.
+- **fetch_webpage** -- Fetch the content of a URL.
+- **calculate** -- Evaluate math expressions.
+- **get_datetime** -- Get the current date and time.
+- **roll_dice** -- Roll dice.
+
+### 3. Chat
+
+Send a message as usual. When the model decides it needs a tool, it will call it automatically. You will see each tool call and its result in a collapsible accordion inside the chat message.
+
+The model may call multiple tools in sequence before giving its final answer.
+
+## Writing custom tools
+
+Each tool is a single `.py` file in `user_data/tools/`. It needs two things:
+
+1. A `tool` dictionary that describes the function (name, description, parameters).
+2. An `execute(arguments)` function that runs it and returns the result.
+
+Here is a minimal example (`user_data/tools/get_datetime.py`):
+
+```python
+from datetime import datetime
+
+tool = {
+    "type": "function",
+    "function": {
+        "name": "get_datetime",
+        "description": "Get the current date and time.",
+        "parameters": {
+            "type": "object",
+            "properties": {},
+        }
+    }
+}
+
+
+def execute(arguments):
+    now = datetime.now()
+    return {"date": now.strftime("%Y-%m-%d"), "time": now.strftime("%I:%M %p")}
+```
+
+An example with parameters (`user_data/tools/roll_dice.py`):
+
+```python
+import random
+
+tool = {
+    "type": "function",
+    "function": {
+        "name": "roll_dice",
+        "description": "Roll one or more dice with the specified number of sides.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "count": {"type": "integer", "description": "Number of dice to roll.", "default": 1},
+                "sides": {"type": "integer", "description": "Number of sides per die.", "default": 20},
+            },
+        }
+    }
+}
+
+
+def execute(arguments):
+    count = max(1, min(arguments.get("count", 1), 1000))
+    sides = max(2, min(arguments.get("sides", 20), 1000))
+    rolls = [random.randint(1, sides) for _ in range(count)]
+    return {"rolls": rolls, "total": sum(rolls)}
+```
+
+You can open the built-in tools in `user_data/tools/` for more examples.
+
+## Tool calling over the API
+
+Tool calling over the API follows the [OpenAI API](https://platform.openai.com/docs/guides/function-calling) convention. Define your tools, send them with your messages, and handle tool calls in a loop until the model gives a final answer.
+
+```python
+import json
+import requests
+
+url = "http://127.0.0.1:5000/v1/chat/completions"
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a given location.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "City name"},
+                },
+                "required": ["location"]
+            }
+        }
+    }
+]
+
+
+def execute_tool(name, arguments):
+    if name == "get_weather":
+        return {"temperature": "14°C", "condition": "partly cloudy"}
+    return {"error": f"Unknown tool: {name}"}
+
+
+messages = [{"role": "user", "content": "What's the weather like in Paris?"}]
+
+for _ in range(10):
+    response = requests.post(url, json={"messages": messages, "tools": tools}).json()
+    choice = response["choices"][0]
+
+    if choice["finish_reason"] == "tool_calls":
+        messages.append({
+            "role": "assistant",
+            "content": choice["message"]["content"],
+            "tool_calls": choice["message"]["tool_calls"],
+        })
+
+        for tool_call in choice["message"]["tool_calls"]:
+            name = tool_call["function"]["name"]
+            arguments = json.loads(tool_call["function"]["arguments"])
+            result = execute_tool(name, arguments)
+            print(f"Tool call: {name}({arguments}) => {result}")
+
+            messages.append({
+                "role": "tool",
+                "tool_call_id": tool_call["id"],
+                "content": json.dumps(result),
+            })
+    else:
+        print(f"\nAssistant: {choice['message']['content']}")
+        break
+```

From e50b823eee369f50981fcaaef3bf05c8dc3e350d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 06:22:28 -0700
Subject: [PATCH 071/210] Update llama.cpp

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 12 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 03f4abac..c702a8d3 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index f3551fa2..65a9aa00 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 5e0cf8ad..bba62491 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d55c3e24..61dbf51b 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 34e864ac..384a552a 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index db23d4bf..0e3d67d3 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index e8cd9fd9..729829b3 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 24c558a9..4b16414c 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index f2e8e691..3a1764dc 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 296c0432..9d115c86 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index aefce769..4472e1d4 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 76bb5872..dad7ee9f 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.91.0/llama_cpp_binaries-0.91.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From e0a38da9f31c95332a5ca863217b1b2e485aecdc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 11:00:12 -0300
Subject: [PATCH 072/210] Improve tool call parsing for Devstral/GPT-OSS and
 preserve thinking across tool turns

---
 extensions/openai/utils.py | 51 +++++++++++++++++++++++++++++++++++---
 modules/chat.py            | 16 ++++++++++--
 2 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py
index eb34ce88..b179c267 100644
--- a/extensions/openai/utils.py
+++ b/extensions/openai/utils.py
@@ -126,8 +126,49 @@ def _parseChannelToolCalls(answer: str, tool_names: list[str]):
     """
     matches = []
     start_pos = None
-    for m in re.finditer(
+    # Pattern 1: to=functions.NAME before <|channel|> (GPT-OSS primary format)
+    # Pattern 2: to=functions.NAME after <|channel|> (alternative format)
+    patterns = [
+        r'to=functions\.([^<\s]+)\s*<\|channel\|>[^<]*<\|message\|>',
         r'<\|channel\|>\w+ to=functions\.([^<\s]+).*?<\|message\|>',
+    ]
+    for pattern in patterns:
+        for m in re.finditer(pattern, answer):
+            func_name = m.group(1).strip()
+            if func_name not in tool_names:
+                continue
+            json_str = _extractBalancedJson(answer, m.end())
+            if json_str is None:
+                continue
+            try:
+                arguments = json.loads(json_str)
+                if start_pos is None:
+                    prefix = answer.rfind('<|start|>assistant', 0, m.start())
+                    start_pos = prefix if prefix != -1 else m.start()
+                matches.append({
+                    "type": "function",
+                    "function": {
+                        "name": func_name,
+                        "arguments": arguments
+                    }
+                })
+            except json.JSONDecodeError:
+                pass
+        if matches:
+            break
+    return matches, start_pos
+
+
+def _parseMistralTokenToolCalls(answer: str, tool_names: list[str]):
+    """Parse Mistral/Devstral-style tool calls with [TOOL_CALLS] and [ARGS] special tokens.
+
+    Format:
+        [TOOL_CALLS]func_name[ARGS]{"arg": "value"}
+    """
+    matches = []
+    start_pos = None
+    for m in re.finditer(
+        r'\[TOOL_CALLS\]\s*(\S+?)\s*\[ARGS\]\s*',
         answer
     ):
         func_name = m.group(1).strip()
@@ -139,8 +180,7 @@ def _parseChannelToolCalls(answer: str, tool_names: list[str]):
         try:
             arguments = json.loads(json_str)
             if start_pos is None:
-                prefix = answer.rfind('<|start|>assistant', 0, m.start())
-                start_pos = prefix if prefix != -1 else m.start()
+                start_pos = m.start()
             matches.append({
                 "type": "function",
                 "function": {
@@ -497,6 +537,11 @@ def parseToolCall(answer: str, tool_names: list[str], return_prefix: bool = Fals
     if matches:
         return _return(matches, start_pos)
 
+    # Check for Mistral/Devstral-style tool calls ([TOOL_CALLS]name[ARGS]json)
+    matches, start_pos = _parseMistralTokenToolCalls(answer, tool_names)
+    if matches:
+        return _return(matches, start_pos)
+
     # Check for bare function-name style tool calls (e.g. Mistral format)
     matches, start_pos = _parseBareNameToolCalls(answer, tool_names)
     if matches:
diff --git a/modules/chat.py b/modules/chat.py
index 57fd50e0..2c6f0ab2 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -177,7 +177,7 @@ def _expand_tool_sequence(tool_seq):
             deserialized = _deserialize_tool_call_arguments(item['tool_calls'])
             messages.append({
                 "role": "assistant",
-                "content": "",
+                "content": item.get('content', ''),
                 "tool_calls": deserialized
             })
             for tc in item['tool_calls']:
@@ -1324,7 +1324,19 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
 
             tc_headers.append(f'{fn_name}({args_summary})')
 
-        seq.append({'tool_calls': serialized})
+        seq_entry = {'tool_calls': serialized}
+        if content_prefix.strip():
+            # Strip GPT-OSS channel tokens so they don't get double-wrapped
+            # by the template (which adds its own channel markup).
+            clean = content_prefix.strip()
+            if '<|channel|>' in clean and '<|message|>' in clean:
+                inner = clean.split('<|message|>', 1)[1] if '<|message|>' in clean else clean
+                if '<|end|>' in inner:
+                    inner = inner.split('<|end|>', 1)[0]
+                clean = inner.strip()
+            if clean:
+                seq_entry['content'] = clean
+        seq.append(seq_entry)
 
         # Clear internal (raw tool markup)
         history['internal'][-1][1] = ''

From aab2596d29a850a185eb0c25c83f7dcf7387d9fc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 15:47:11 -0300
Subject: [PATCH 073/210] UI: Fix multiple thinking blocks rendering as raw
 text in HTML generator

---
 modules/html_generator.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 138d4ade..8f3f261f 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -369,8 +369,10 @@ def convert_to_markdown(string, message_id=None):
         if not text.strip():
             return
 
-        thinking_content, remaining = extract_thinking_block(text)
-        if thinking_content is not None:
+        while text.strip():
+            thinking_content, remaining = extract_thinking_block(text)
+            if thinking_content is None:
+                break
             has_remaining = bool(remaining.strip()) or not is_last_segment
             html_parts.append(build_thinking_block(thinking_content, message_id, has_remaining, think_idx))
             think_idx += 1

From d4c22ced83dd6fed7bb2785c7e6d5f7cbeace2db Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 15:47:14 -0300
Subject: [PATCH 074/210] UI: Optimize syntax highlighting and autoscroll by
 moving from MutationObserver to morphdom updates

---
 js/global_scope_js.js |  45 ++++++++++++++++
 js/main.js            | 121 ++++++++++++++----------------------------
 2 files changed, 84 insertions(+), 82 deletions(-)

diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 3207a681..425c2c59 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -269,6 +269,34 @@ function removeLastClick() {
   document.getElementById("Remove-last").click();
 }
 
+function autoScrollToBottom() {
+  if (!window.isScrolled) {
+    const chatParent = document.getElementById("chat")?.parentNode?.parentNode?.parentNode;
+    if (chatParent) {
+      const maxScroll = chatParent.scrollHeight - chatParent.clientHeight;
+      if (maxScroll > 0 && chatParent.scrollTop < maxScroll - 1) {
+        chatParent.scrollTop = maxScroll;
+      }
+    }
+  }
+}
+
+function updateInstructPadding() {
+  const chatElement = document.getElementById("chat");
+  if (chatElement && chatElement.getAttribute("data-mode") === "instruct") {
+    const messagesContainer = chatElement.querySelector(".messages");
+    const lastChild = messagesContainer?.lastElementChild;
+    const prevSibling = lastChild?.previousElementSibling;
+    if (lastChild && prevSibling) {
+      let bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
+      if (window.innerWidth <= 924) {
+        bufferHeight = Math.max(0, bufferHeight - 32);
+      }
+      messagesContainer.style.paddingBottom = `${bufferHeight}px`;
+    }
+  }
+}
+
 let pendingMorphdomData = null;
 let morphdomRafId = null;
 
@@ -373,10 +401,23 @@ function applyMorphdomUpdate(data) {
     }
   );
 
+  // Syntax highlighting and LaTeX
+  if (window.doSyntaxHighlighting) {
+    window.doSyntaxHighlighting();
+  }
+
+  // Auto-scroll runs both before and after padding update.
+  // Before: so content growth isn't hidden by padding absorption.
+  // After: so padding-added space is also scrolled into view.
+  autoScrollToBottom();
+  updateInstructPadding();
+  autoScrollToBottom();
+
   // Add toggle listeners for new blocks
   queryScope.querySelectorAll(".thinking-block").forEach(block => {
     if (!block._hasToggleListener) {
       block.addEventListener("toggle", function(e) {
+        const wasScrolled = window.isScrolled;
         if (this.open) {
           const content = this.querySelector(".thinking-content");
           if (content) {
@@ -385,6 +426,10 @@ function applyMorphdomUpdate(data) {
             }, 0);
           }
         }
+        updateInstructPadding();
+        // Restore scroll state so the browser's layout adjustment
+        // from the toggle doesn't disable auto-scroll
+        window.isScrolled = wasScrolled;
       });
       block._hasToggleListener = true;
     }
diff --git a/js/main.js b/js/main.js
index c3e51c3c..0cefaa6e 100644
--- a/js/main.js
+++ b/js/main.js
@@ -147,6 +147,7 @@ window.isScrolled = false;
 let scrollTimeout;
 let lastScrollTop = 0;
 let lastScrollHeight = 0;
+let lastClientHeight = 0;
 
 targetElement.addEventListener("scroll", function() {
   let diff = targetElement.scrollHeight - targetElement.clientHeight;
@@ -159,11 +160,12 @@ targetElement.addEventListener("scroll", function() {
 
   if(isAtBottomNow) {
     window.isScrolled = false;
-  } else if (targetElement.scrollTop < lastScrollTop && targetElement.scrollHeight >= lastScrollHeight) {
+  } else if (targetElement.scrollTop < lastScrollTop && targetElement.scrollHeight >= lastScrollHeight && targetElement.clientHeight <= lastClientHeight) {
     window.isScrolled = true;
   }
   lastScrollTop = targetElement.scrollTop;
   lastScrollHeight = targetElement.scrollHeight;
+  lastClientHeight = targetElement.clientHeight;
 
   // Clear previous timeout and set new one
   clearTimeout(scrollTimeout);
@@ -174,14 +176,7 @@ targetElement.addEventListener("scroll", function() {
 });
 
 // Create a MutationObserver instance
-const observer = new MutationObserver(function(mutations) {
-  // Check if this is just the scrolling class being toggled
-  const isScrollingClassOnly = mutations.every(mutation =>
-    mutation.type === "attributes" &&
-    mutation.attributeName === "class" &&
-    mutation.target === targetElement
-  );
-
+const observer = new MutationObserver(function() {
   if (targetElement.classList.contains("_generating")) {
     typing.parentNode.classList.add("visible-dots");
     document.getElementById("stop").style.display = "flex";
@@ -191,44 +186,11 @@ const observer = new MutationObserver(function(mutations) {
     document.getElementById("stop").style.display = "none";
     document.getElementById("Generate").style.display = "flex";
   }
-
-  doSyntaxHighlighting();
-
-  if (!window.isScrolled && !isScrollingClassOnly) {
-    const maxScroll = targetElement.scrollHeight - targetElement.clientHeight;
-    if (maxScroll > 0 && targetElement.scrollTop < maxScroll - 1) {
-      targetElement.scrollTop = maxScroll;
-    }
-  }
-
-  const chatElement = document.getElementById("chat");
-  if (chatElement && chatElement.getAttribute("data-mode") === "instruct") {
-    const messagesContainer = chatElement.querySelector(".messages");
-    const lastChild = messagesContainer?.lastElementChild;
-    const prevSibling = lastChild?.previousElementSibling;
-    if (lastChild && prevSibling) {
-      // Add padding to the messages container to create room for the last message.
-      // The purpose of this is to avoid constant scrolling during streaming in
-      // instruct mode.
-      let bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
-
-      // Subtract header height when screen width is <= 924px
-      if (window.innerWidth <= 924) {
-        bufferHeight = Math.max(0, bufferHeight - 32);
-      }
-
-      messagesContainer.style.paddingBottom = `${bufferHeight}px`;
-    }
-  }
 });
 
-// Configure the observer to watch for changes in the subtree and attributes
+// Only watch for attribute changes on targetElement (e.g. _generating class)
 const config = {
-  childList: true,
-  subtree: true,
-  characterData: true,
-  attributeOldValue: true,
-  characterDataOldValue: true
+  attributes: true
 };
 
 // Start observing the target element
@@ -247,55 +209,50 @@ function isElementVisibleOnScreen(element) {
   );
 }
 
-function doSyntaxHighlighting() {
+window.doSyntaxHighlighting = function() {
   const messageBodies = document.getElementById("chat").querySelectorAll(".message-body");
 
   if (messageBodies.length > 0) {
-    observer.disconnect();
+    let hasSeenVisible = false;
 
-    try {
-      let hasSeenVisible = false;
+    // Go from last message to first
+    for (let i = messageBodies.length - 1; i >= 0; i--) {
+      const messageBody = messageBodies[i];
 
-      // Go from last message to first
-      for (let i = messageBodies.length - 1; i >= 0; i--) {
-        const messageBody = messageBodies[i];
+      if (isElementVisibleOnScreen(messageBody)) {
+        hasSeenVisible = true;
 
-        if (isElementVisibleOnScreen(messageBody)) {
-          hasSeenVisible = true;
+        // Handle both code and math in a single pass through each message
+        const codeBlocks = messageBody.querySelectorAll("pre code:not([data-highlighted])");
+        codeBlocks.forEach((codeBlock) => {
+          hljs.highlightElement(codeBlock);
+          codeBlock.setAttribute("data-highlighted", "true");
+          codeBlock.classList.add("pretty_scrollbar");
+        });
 
-          // Handle both code and math in a single pass through each message
-          const codeBlocks = messageBody.querySelectorAll("pre code:not([data-highlighted])");
-          codeBlocks.forEach((codeBlock) => {
-            hljs.highlightElement(codeBlock);
-            codeBlock.setAttribute("data-highlighted", "true");
-            codeBlock.classList.add("pretty_scrollbar");
-          });
-
-          // Only render math in visible elements
-          const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
-          mathContainers.forEach(container => {
-            if (isElementVisibleOnScreen(container)) {
-              renderMathInElement(container, {
-                delimiters: [
-                  { left: "$$", right: "$$", display: true },
-                  { left: "$", right: "$", display: false },
-                  { left: "\\(", right: "\\)", display: false },
-                  { left: "\\[", right: "\\]", display: true },
-                ],
-              });
-            }
-          });
-        } else if (hasSeenVisible) {
-        // We've seen visible messages but this one is not visible
-        // Since we're going from last to first, we can break
-          break;
-        }
+        // Only render math in visible elements
+        const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
+        mathContainers.forEach(container => {
+          if (isElementVisibleOnScreen(container)) {
+            renderMathInElement(container, {
+              delimiters: [
+                { left: "$$", right: "$$", display: true },
+                { left: "$", right: "$", display: false },
+                { left: "\\(", right: "\\)", display: false },
+                { left: "\\[", right: "\\]", display: true },
+              ],
+            });
+          }
+        });
+      } else if (hasSeenVisible) {
+      // We've seen visible messages but this one is not visible
+      // Since we're going from last to first, we can break
+        break;
       }
-    } finally {
-      observer.observe(targetElement, config);
     }
   }
 }
+const doSyntaxHighlighting = window.doSyntaxHighlighting;
 
 //------------------------------------------------
 // Add some scrollbars

From 5362bbb4132ae5ddbed4c4dab739e7dd64c1e6ab Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 12:09:08 -0700
Subject: [PATCH 075/210] Make web_search not download the page contents, use
 fetch_webpage instead

---
 modules/ui_chat.py            | 10 ++++++++++
 modules/web_search.py         | 14 ++++++++++++--
 user_data/tools/web_search.py | 14 ++++++--------
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 039b9af6..ea341fa6 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -96,6 +96,16 @@ def create_ui():
                 shared.gradio['tools_refresh'] = gr.Button('Refresh list', elem_id='tools-refresh-btn', visible=False)
                 shared.gradio['tools_refresh'].click(fn=lambda: gr.update(choices=get_available_tools()), inputs=[], outputs=[shared.gradio['selected_tools']])
 
+                def sync_web_tools(selected):
+                    if 'web_search' in selected and 'fetch_webpage' not in selected:
+                        selected.append('fetch_webpage')
+                    elif 'web_search' not in selected and 'fetch_webpage' in selected:
+                        selected.remove('fetch_webpage')
+
+                    return gr.update(value=selected)
+
+                shared.gradio['selected_tools'].change(fn=sync_web_tools, inputs=[shared.gradio['selected_tools']], outputs=[shared.gradio['selected_tools']], show_progress=False)
+
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 with gr.Row():
diff --git a/modules/web_search.py b/modules/web_search.py
index b14cd042..754dd111 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -49,8 +49,8 @@ def download_web_page(url, timeout=10, include_links=False):
         return ""
 
 
-def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
-    """Perform web search and return results with content"""
+def perform_web_search(query, num_pages=3, max_workers=5, timeout=10, fetch_content=True):
+    """Perform web search and return results, optionally with page content"""
     try:
         search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
 
@@ -78,6 +78,16 @@ def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
 
         search_results = [None] * len(download_tasks)  # Pre-allocate to maintain order
 
+        if not fetch_content:
+            for url, title, index in download_tasks:
+                search_results[index] = {
+                    'title': title,
+                    'url': url,
+                    'content': ''
+                }
+
+            return search_results
+
         # Download pages in parallel
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
             # Submit all download tasks
diff --git a/user_data/tools/web_search.py b/user_data/tools/web_search.py
index 80845963..30d13473 100644
--- a/user_data/tools/web_search.py
+++ b/user_data/tools/web_search.py
@@ -1,16 +1,15 @@
-from modules.web_search import perform_web_search, truncate_content_by_tokens
+from modules.web_search import perform_web_search
 
 tool = {
     "type": "function",
     "function": {
         "name": "web_search",
-        "description": "Search the web using DuckDuckGo and return page contents.",
+        "description": "Search the web using DuckDuckGo and return a list of result titles and URLs. Use fetch_webpage to read the contents of a specific result.",
         "parameters": {
             "type": "object",
             "properties": {
                 "query": {"type": "string", "description": "The search query."},
-                "num_pages": {"type": "integer", "description": "Number of search result pages to fetch (default: 3)."},
-                "max_tokens": {"type": "integer", "description": "Maximum number of tokens per page result (default: 2048)."},
+                "num_pages": {"type": "integer", "description": "Number of search results to return (default: 3)."},
             },
             "required": ["query"]
         }
@@ -21,11 +20,10 @@ tool = {
 def execute(arguments):
     query = arguments.get("query", "")
     num_pages = arguments.get("num_pages", 3)
-    max_tokens = arguments.get("max_tokens", 2048)
-    results = perform_web_search(query, num_pages=num_pages)
+    results = perform_web_search(query, num_pages=num_pages, fetch_content=False)
     output = []
     for r in results:
-        if r and r["content"].strip():
-            output.append({"title": r["title"], "url": r["url"], "content": truncate_content_by_tokens(r["content"], max_tokens=max_tokens)})
+        if r:
+            output.append({"title": r["title"], "url": r["url"]})
 
     return output if output else [{"error": "No results found."}]

From cabb95f0d6077d44741eb2a3ee0587470ade6300 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 12:24:48 -0700
Subject: [PATCH 076/210] UI: Increase the instruct width to 768px

---
 css/html_instruct_style.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 72a148c3..d4780350 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -78,7 +78,7 @@
 
 .chat .user-message .text,
 .chat .assistant-message .text {
-    max-width: 700px;
+    max-width: 768px;
     margin-left: auto;
     margin-right: auto;
 }

From 24e7e77b55e7758f7bd07e07016cc88b8b188c8b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 12:37:10 -0700
Subject: [PATCH 077/210] Clean up

---
 modules/chat.py    | 2 +-
 modules/ui_chat.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 2c6f0ab2..87e52851 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1330,7 +1330,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
             # by the template (which adds its own channel markup).
             clean = content_prefix.strip()
             if '<|channel|>' in clean and '<|message|>' in clean:
-                inner = clean.split('<|message|>', 1)[1] if '<|message|>' in clean else clean
+                inner = clean.split('<|message|>', 1)[1]
                 if '<|end|>' in inner:
                     inner = inner.split('<|end|>', 1)[0]
                 clean = inner.strip()
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index ea341fa6..ce9fc0a2 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -99,8 +99,6 @@ def create_ui():
                 def sync_web_tools(selected):
                     if 'web_search' in selected and 'fetch_webpage' not in selected:
                         selected.append('fetch_webpage')
-                    elif 'web_search' not in selected and 'fetch_webpage' in selected:
-                        selected.remove('fetch_webpage')
 
                     return gr.update(value=selected)
 

From 0cd245bcbb46d894989e1ddd688d6640ba6ac537 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 12:58:56 -0700
Subject: [PATCH 078/210] UI: Make autoscroll more robust after the
 optimizations

---
 js/main.js | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/js/main.js b/js/main.js
index 0cefaa6e..a8bbbc71 100644
--- a/js/main.js
+++ b/js/main.js
@@ -181,6 +181,13 @@ const observer = new MutationObserver(function() {
     typing.parentNode.classList.add("visible-dots");
     document.getElementById("stop").style.display = "flex";
     document.getElementById("Generate").style.display = "none";
+    // If the user is near the bottom, ensure auto-scroll is enabled
+    // for the new reply. This catches cases where isScrolled was
+    // incorrectly set to true by layout shifts during page load, etc.
+    const diff = targetElement.scrollHeight - targetElement.clientHeight;
+    if (Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0) {
+      window.isScrolled = false;
+    }
   } else {
     typing.parentNode.classList.remove("visible-dots");
     document.getElementById("stop").style.display = "none";

From cb88066d15c6ffa54774f805190f000951c05e84 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 13:17:41 -0700
Subject: [PATCH 079/210] Update llama.cpp

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 12 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index c702a8d3..8a0802f7 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 65a9aa00..9b31d668 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index bba62491..138639e5 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 61dbf51b..f3ebd171 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 384a552a..e32a2ed1 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 0e3d67d3..93eb3b85 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 729829b3..36e0e4d9 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 4b16414c..495bd5fa 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 3a1764dc..7e82f68d 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 9d115c86..046619e1 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 4472e1d4..590562f8 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index dad7ee9f..bf80deb0 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.92.0/llama_cpp_binaries-0.92.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From e8d1c663037666bafc0a45f4be0471a88fda4d57 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 18:13:12 -0700
Subject: [PATCH 080/210] Clean up tool calling code

---
 extensions/openai/completions.py |   7 +-
 extensions/openai/utils.py       | 558 -------------------------------
 modules/chat.py                  |  17 +-
 modules/tool_parsing.py          | 553 ++++++++++++++++++++++++++++++
 modules/tool_use.py              |   6 +-
 modules/ui_chat.py               |   2 +-
 modules/web_search.py            |  37 +-
 7 files changed, 604 insertions(+), 576 deletions(-)
 create mode 100644 modules/tool_parsing.py

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 290a5bc0..27defe42 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -11,7 +11,8 @@ from pydantic import ValidationError
 
 from extensions.openai.errors import InvalidRequestError
 from extensions.openai.typing import ToolDefinition
-from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
+from extensions.openai.utils import debug_msg
+from modules.tool_parsing import get_tool_call_id, parse_tool_call
 from modules import shared
 from modules.reasoning import extract_reasoning
 from modules.chat import (
@@ -491,10 +492,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         answer = a['internal'][-1][1]
 
         if supported_tools is not None:
-            tool_call = parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else []
+            tool_call = parse_tool_call(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else []
             if len(tool_call) > 0:
                 for tc in tool_call:
-                    tc["id"] = getToolCallId()
+                    tc["id"] = get_tool_call_id()
                     if stream:
                         tc["index"] = len(tool_calls)
                     tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py
index b179c267..2b414769 100644
--- a/extensions/openai/utils.py
+++ b/extensions/openai/utils.py
@@ -1,8 +1,5 @@
 import base64
-import json
 import os
-import random
-import re
 import time
 import traceback
 from typing import Callable, Optional
@@ -55,558 +52,3 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star
             time.sleep(3)
 
         raise Exception('Could not start cloudflared.')
-
-
-def getToolCallId() -> str:
-    letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789"
-    b = [random.choice(letter_bytes) for _ in range(8)]
-    return "call_" + "".join(b).lower()
-
-
-def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]):
-    # check if property 'function' exists and is a dictionary, otherwise adapt dict
-    if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str):
-        candidate_dict = {"type": "function", "function": candidate_dict}
-    if 'function' in candidate_dict and isinstance(candidate_dict['function'], str):
-        candidate_dict['name'] = candidate_dict['function']
-        del candidate_dict['function']
-        candidate_dict = {"type": "function", "function": candidate_dict}
-    if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict):
-        # check if 'name' exists within 'function' and is part of known tools
-        if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names:
-            candidate_dict["type"] = "function"  # ensure required property 'type' exists and has the right value
-            # map property 'parameters' used by some older models to 'arguments'
-            if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]:
-                candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"]
-                del candidate_dict["function"]["parameters"]
-            return candidate_dict
-    return None
-
-
-def _extractBalancedJson(text: str, start: int) -> str | None:
-    """Extract a balanced JSON object from text starting at the given position.
-
-    Walks through the string tracking brace depth and string boundaries
-    to correctly handle arbitrary nesting levels.
-    """
-    if start >= len(text) or text[start] != '{':
-        return None
-    depth = 0
-    in_string = False
-    escape_next = False
-    for i in range(start, len(text)):
-        c = text[i]
-        if escape_next:
-            escape_next = False
-            continue
-        if c == '\\' and in_string:
-            escape_next = True
-            continue
-        if c == '"':
-            in_string = not in_string
-            continue
-        if in_string:
-            continue
-        if c == '{':
-            depth += 1
-        elif c == '}':
-            depth -= 1
-            if depth == 0:
-                return text[start:i + 1]
-    return None
-
-
-def _parseChannelToolCalls(answer: str, tool_names: list[str]):
-    """Parse channel-based tool calls used by GPT-OSS and similar models.
-
-    Format:
-        <|start|>assistant to=functions.func_name<|channel|>commentary json<|message|>{"arg": "value"}
-    or:
-        <|channel|>commentary to=functions.func_name <|constrain|>json<|message|>{"arg": "value"}
-    """
-    matches = []
-    start_pos = None
-    # Pattern 1: to=functions.NAME before <|channel|> (GPT-OSS primary format)
-    # Pattern 2: to=functions.NAME after <|channel|> (alternative format)
-    patterns = [
-        r'to=functions\.([^<\s]+)\s*<\|channel\|>[^<]*<\|message\|>',
-        r'<\|channel\|>\w+ to=functions\.([^<\s]+).*?<\|message\|>',
-    ]
-    for pattern in patterns:
-        for m in re.finditer(pattern, answer):
-            func_name = m.group(1).strip()
-            if func_name not in tool_names:
-                continue
-            json_str = _extractBalancedJson(answer, m.end())
-            if json_str is None:
-                continue
-            try:
-                arguments = json.loads(json_str)
-                if start_pos is None:
-                    prefix = answer.rfind('<|start|>assistant', 0, m.start())
-                    start_pos = prefix if prefix != -1 else m.start()
-                matches.append({
-                    "type": "function",
-                    "function": {
-                        "name": func_name,
-                        "arguments": arguments
-                    }
-                })
-            except json.JSONDecodeError:
-                pass
-        if matches:
-            break
-    return matches, start_pos
-
-
-def _parseMistralTokenToolCalls(answer: str, tool_names: list[str]):
-    """Parse Mistral/Devstral-style tool calls with [TOOL_CALLS] and [ARGS] special tokens.
-
-    Format:
-        [TOOL_CALLS]func_name[ARGS]{"arg": "value"}
-    """
-    matches = []
-    start_pos = None
-    for m in re.finditer(
-        r'\[TOOL_CALLS\]\s*(\S+?)\s*\[ARGS\]\s*',
-        answer
-    ):
-        func_name = m.group(1).strip()
-        if func_name not in tool_names:
-            continue
-        json_str = _extractBalancedJson(answer, m.end())
-        if json_str is None:
-            continue
-        try:
-            arguments = json.loads(json_str)
-            if start_pos is None:
-                start_pos = m.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": func_name,
-                    "arguments": arguments
-                }
-            })
-        except json.JSONDecodeError:
-            pass
-    return matches, start_pos
-
-
-def _parseBareNameToolCalls(answer: str, tool_names: list[str]):
-    """Parse bare function-name style tool calls used by Mistral and similar models.
-
-    Format:
-        functionName{"arg": "value"}
-    Multiple calls are concatenated directly or separated by whitespace.
-    """
-    matches = []
-    start_pos = None
-    # Match tool name followed by opening brace, then extract balanced JSON
-    escaped_names = [re.escape(name) for name in tool_names]
-    pattern = r'(?:' + '|'.join(escaped_names) + r')\s*\{'
-    for match in re.finditer(pattern, answer):
-        text = match.group(0)
-        name = None
-        for n in tool_names:
-            if text.startswith(n):
-                name = n
-                break
-        if not name:
-            continue
-        brace_start = match.end() - 1
-        json_str = _extractBalancedJson(answer, brace_start)
-        if json_str is None:
-            continue
-        try:
-            arguments = json.loads(json_str)
-            if start_pos is None:
-                start_pos = match.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": name,
-                    "arguments": arguments
-                }
-            })
-        except json.JSONDecodeError:
-            pass
-    return matches, start_pos
-
-
-def _parseXmlParamToolCalls(answer: str, tool_names: list[str]):
-    """Parse XML-parameter style tool calls used by Qwen3.5 and similar models.
-
-    Format:
-        <tool_call>
-        <function=function_name>
-        <parameter=param_name>value</parameter>
-        </function>
-        </tool_call>
-    """
-    matches = []
-    start_pos = None
-    for tc_match in re.finditer(r'<tool_call>\s*(.*?)\s*</tool_call>', answer, re.DOTALL):
-        tc_content = tc_match.group(1)
-        func_match = re.search(r'<function=([^>]+)>', tc_content)
-        if not func_match:
-            continue
-        func_name = func_match.group(1).strip()
-        if func_name not in tool_names:
-            continue
-        arguments = {}
-        for param_match in re.finditer(r'<parameter=([^>]+)>\s*(.*?)\s*</parameter>', tc_content, re.DOTALL):
-            param_name = param_match.group(1).strip()
-            param_value = param_match.group(2).strip()
-            try:
-                param_value = json.loads(param_value)
-            except (json.JSONDecodeError, ValueError):
-                pass  # keep as string
-            arguments[param_name] = param_value
-        if start_pos is None:
-            start_pos = tc_match.start()
-        matches.append({
-            "type": "function",
-            "function": {
-                "name": func_name,
-                "arguments": arguments
-            }
-        })
-    return matches, start_pos
-
-
-def _parseKimiToolCalls(answer: str, tool_names: list[str]):
-    """Parse Kimi-K2-style tool calls using pipe-delimited tokens.
-
-    Format:
-        <|tool_calls_section_begin|>
-        <|tool_call_begin|>functions.func_name:index<|tool_call_argument_begin|>{"arg": "value"}<|tool_call_end|>
-        <|tool_calls_section_end|>
-    """
-    matches = []
-    start_pos = None
-    for m in re.finditer(
-        r'<\|tool_call_begin\|>\s*(?:functions\.)?(\S+?)(?::\d+)?\s*<\|tool_call_argument_begin\|>\s*',
-        answer
-    ):
-        func_name = m.group(1).strip()
-        if func_name not in tool_names:
-            continue
-        json_str = _extractBalancedJson(answer, m.end())
-        if json_str is None:
-            continue
-        try:
-            arguments = json.loads(json_str)
-            if start_pos is None:
-                # Check for section begin marker before the call marker
-                section = answer.rfind('<|tool_calls_section_begin|>', 0, m.start())
-                start_pos = section if section != -1 else m.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": func_name,
-                    "arguments": arguments
-                }
-            })
-        except json.JSONDecodeError:
-            pass
-    return matches, start_pos
-
-
-def _parseMiniMaxToolCalls(answer: str, tool_names: list[str]):
-    """Parse MiniMax-style tool calls using invoke/parameter XML tags.
-
-    Format:
-        <minimax:tool_call>
-        <invoke name="function_name">
-        <parameter name="param_name">value</parameter>
-        </invoke>
-        </minimax:tool_call>
-    """
-    matches = []
-    start_pos = None
-    for tc_match in re.finditer(r'<minimax:tool_call>\s*(.*?)\s*</minimax:tool_call>', answer, re.DOTALL):
-        tc_content = tc_match.group(1)
-        # Split on <invoke> to handle multiple parallel calls in one block
-        for invoke_match in re.finditer(r'<invoke\s+name="([^"]+)">(.*?)</invoke>', tc_content, re.DOTALL):
-            func_name = invoke_match.group(1).strip()
-            if func_name not in tool_names:
-                continue
-            invoke_body = invoke_match.group(2)
-            arguments = {}
-            for param_match in re.finditer(r'<parameter\s+name="([^"]+)">\s*(.*?)\s*</parameter>', invoke_body, re.DOTALL):
-                param_name = param_match.group(1).strip()
-                param_value = param_match.group(2).strip()
-                try:
-                    param_value = json.loads(param_value)
-                except (json.JSONDecodeError, ValueError):
-                    pass  # keep as string
-                arguments[param_name] = param_value
-            if start_pos is None:
-                start_pos = tc_match.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": func_name,
-                    "arguments": arguments
-                }
-            })
-    return matches, start_pos
-
-
-def _parseDeepSeekToolCalls(answer: str, tool_names: list[str]):
-    """Parse DeepSeek-style tool calls using fullwidth Unicode token delimiters.
-
-    Format:
-        <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>func_name<｜tool▁sep｜>{"arg": "value"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>
-    """
-    matches = []
-    start_pos = None
-    for m in re.finditer(
-        r'<｜tool▁call▁begin｜>\s*(\S+?)\s*<｜tool▁sep｜>\s*',
-        answer
-    ):
-        func_name = m.group(1).strip()
-        if func_name not in tool_names:
-            continue
-        json_str = _extractBalancedJson(answer, m.end())
-        if json_str is None:
-            continue
-        try:
-            arguments = json.loads(json_str)
-            if start_pos is None:
-                # Check for section begin marker before the call marker
-                section = answer.rfind('<｜tool▁calls▁begin｜>', 0, m.start())
-                start_pos = section if section != -1 else m.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": func_name,
-                    "arguments": arguments
-                }
-            })
-        except json.JSONDecodeError:
-            pass
-    return matches, start_pos
-
-
-def _parseGlmToolCalls(answer: str, tool_names: list[str]):
-    """Parse GLM-style tool calls using arg_key/arg_value XML pairs.
-
-    Format:
-        <tool_call>function_name
-        <arg_key>key1</arg_key>
-        <arg_value>value1</arg_value>
-        </tool_call>
-    """
-    matches = []
-    start_pos = None
-    for tc_match in re.finditer(r'<tool_call>\s*(.*?)\s*</tool_call>', answer, re.DOTALL):
-        tc_content = tc_match.group(1)
-        # First non-tag text is the function name
-        name_match = re.match(r'([^<\s]+)', tc_content.strip())
-        if not name_match:
-            continue
-        func_name = name_match.group(1).strip()
-        if func_name not in tool_names:
-            continue
-        # Extract arg_key/arg_value pairs
-        keys = [k.group(1).strip() for k in re.finditer(r'<arg_key>\s*(.*?)\s*</arg_key>', tc_content, re.DOTALL)]
-        vals = [v.group(1).strip() for v in re.finditer(r'<arg_value>\s*(.*?)\s*</arg_value>', tc_content, re.DOTALL)]
-        if len(keys) != len(vals):
-            continue
-        arguments = {}
-        for k, v in zip(keys, vals):
-            try:
-                v = json.loads(v)
-            except (json.JSONDecodeError, ValueError):
-                pass  # keep as string
-            arguments[k] = v
-        if start_pos is None:
-            start_pos = tc_match.start()
-        matches.append({
-            "type": "function",
-            "function": {
-                "name": func_name,
-                "arguments": arguments
-            }
-        })
-    return matches, start_pos
-
-
-def _parsePythonicToolCalls(answer: str, tool_names: list[str]):
-    """Parse pythonic-style tool calls used by Llama 4 and similar models.
-
-    Format:
-        [func_name(param1="value1", param2="value2"), func_name2(...)]
-    """
-    matches = []
-    start_pos = None
-    # Match a bracketed list of function calls
-    bracket_match = re.search(r'\[([^\[\]]+)\]', answer)
-    if not bracket_match:
-        return matches, start_pos
-
-    inner = bracket_match.group(1)
-
-    # Build pattern for known tool names
-    escaped_names = [re.escape(name) for name in tool_names]
-    name_pattern = '|'.join(escaped_names)
-
-    for call_match in re.finditer(
-        r'(' + name_pattern + r')\(([^)]*)\)',
-        inner
-    ):
-        func_name = call_match.group(1)
-        params_str = call_match.group(2).strip()
-        arguments = {}
-
-        if params_str:
-            # Parse key="value" pairs, handling commas inside quoted values
-            for param_match in re.finditer(
-                r'(\w+)\s*=\s*("(?:[^"\\]|\\.)*"|\'(?:[^\'\\]|\\.)*\'|[^,\)]+)',
-                params_str
-            ):
-                param_name = param_match.group(1)
-                param_value = param_match.group(2).strip()
-                # Strip surrounding quotes
-                if (param_value.startswith('"') and param_value.endswith('"')) or \
-                   (param_value.startswith("'") and param_value.endswith("'")):
-                    param_value = param_value[1:-1]
-                # Try to parse as JSON for numeric/bool/null values
-                try:
-                    param_value = json.loads(param_value)
-                except (json.JSONDecodeError, ValueError):
-                    pass
-                arguments[param_name] = param_value
-
-        if start_pos is None:
-            start_pos = bracket_match.start()
-        matches.append({
-            "type": "function",
-            "function": {
-                "name": func_name,
-                "arguments": arguments
-            }
-        })
-
-    return matches, start_pos
-
-
-def parseToolCall(answer: str, tool_names: list[str], return_prefix: bool = False):
-    matches = []
-    start_pos = None
-
-    def _return(matches, start_pos):
-        if return_prefix:
-            prefix = answer[:start_pos] if matches and start_pos is not None else ''
-            return matches, prefix
-        return matches
-
-    # abort on very short answers to save computation cycles
-    if len(answer) < 10:
-        return _return(matches, start_pos)
-
-    # Check for DeepSeek-style tool calls (fullwidth Unicode token delimiters)
-    matches, start_pos = _parseDeepSeekToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for Kimi-K2-style tool calls (pipe-delimited tokens)
-    matches, start_pos = _parseKimiToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for channel-based tool calls (e.g. GPT-OSS format)
-    matches, start_pos = _parseChannelToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for MiniMax-style tool calls (invoke/parameter XML tags)
-    matches, start_pos = _parseMiniMaxToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for GLM-style tool calls (arg_key/arg_value XML pairs)
-    matches, start_pos = _parseGlmToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for XML-parameter style tool calls (e.g. Qwen3.5 format)
-    matches, start_pos = _parseXmlParamToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for Mistral/Devstral-style tool calls ([TOOL_CALLS]name[ARGS]json)
-    matches, start_pos = _parseMistralTokenToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for bare function-name style tool calls (e.g. Mistral format)
-    matches, start_pos = _parseBareNameToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for pythonic-style tool calls (e.g. Llama 4 format)
-    matches, start_pos = _parsePythonicToolCalls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Define the regex pattern to find the JSON content wrapped in <function>, <tools>, <tool_call>, and other tags observed from various models
-    patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"]
-
-    for pattern in patterns:
-        for match in re.finditer(pattern, answer, re.DOTALL):
-            # print(match.group(2))
-            if match.group(2) is None:
-                continue
-            # remove backtick wraps if present
-            candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip())
-            candidate = re.sub(r"```$", "", candidate.strip())
-            # unwrap inner tags
-            candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL)
-            # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
-            if re.search(r"\}\s*\n\s*\{", candidate) is not None:
-                candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
-            if not candidate.strip().startswith("["):
-                candidate = "[" + candidate + "]"
-
-            candidates = []
-            try:
-                # parse the candidate JSON into a dictionary
-                candidates = json.loads(candidate)
-                if not isinstance(candidates, list):
-                    candidates = [candidates]
-            except json.JSONDecodeError:
-                # Ignore invalid JSON silently
-                continue
-
-            for candidate_dict in candidates:
-                checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
-                if checked_candidate is not None:
-                    if start_pos is None:
-                        start_pos = match.start()
-                    matches.append(checked_candidate)
-
-        # last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags
-        if len(matches) == 0:
-            try:
-                candidate = answer
-                # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
-                if re.search(r"\}\s*\n\s*\{", candidate) is not None:
-                    candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
-                if not candidate.strip().startswith("["):
-                    candidate = "[" + candidate + "]"
-                # parse the candidate JSON into a dictionary
-                candidates = json.loads(candidate)
-                if not isinstance(candidates, list):
-                    candidates = [candidates]
-                for candidate_dict in candidates:
-                    checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
-                    if checked_candidate is not None:
-                        matches.append(checked_candidate)
-            except json.JSONDecodeError:
-                # Ignore invalid JSON silently
-                pass
-
-    return _return(matches, start_pos)
diff --git a/modules/chat.py b/modules/chat.py
index 87e52851..02ae46e4 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -239,6 +239,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
         name1=state['name1'],
         name2=state['name2'],
         user_bio=replace_character_names(state['user_bio'], state['name1'], state['name2']),
+        tools=state['tools'] if 'tools' in state else None,
     )
 
     messages = []
@@ -1186,14 +1187,10 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
 
     # Load tools if any are selected
     selected = state.get('selected_tools', [])
-    parseToolCall = None
+    parse_tool_call = None
     if selected:
         from modules.tool_use import load_tools, execute_tool
-        try:
-            from extensions.openai.utils import parseToolCall, getToolCallId
-        except ImportError:
-            logger.warning('Tool calling requires the openai extension for parseToolCall. Disabling tools.')
-            selected = []
+        from modules.tool_parsing import parse_tool_call, get_tool_call_id
 
     if selected:
         tool_defs, tool_executors = load_tools(selected)
@@ -1253,7 +1250,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
                 last_save_time = current_time
 
             # Early stop on tool call detection
-            if tool_func_names and parseToolCall(history['internal'][-1][1], tool_func_names):
+            if tool_func_names and parse_tool_call(history['internal'][-1][1], tool_func_names):
                 break
 
         # Save the model's visible output before re-applying visible_prefix,
@@ -1285,7 +1282,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
             break
 
         answer = history['internal'][-1][1]
-        parsed_calls, content_prefix = parseToolCall(answer, tool_func_names, return_prefix=True) if answer else (None, '')
+        parsed_calls, content_prefix = parse_tool_call(answer, tool_func_names, return_prefix=True) if answer else (None, '')
 
         if not parsed_calls:
             break  # No tool calls — done
@@ -1302,7 +1299,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         serialized = []
         tc_headers = []
         for tc in parsed_calls:
-            tc['id'] = getToolCallId()
+            tc['id'] = get_tool_call_id()
             fn_name = tc['function']['name']
             fn_args = tc['function'].get('arguments', {})
 
@@ -1343,7 +1340,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
 
         # Preserve thinking block and intermediate text from this turn.
         # content_prefix is the raw text before tool call syntax (returned
-        # by parseToolCall); HTML-escape it and extract thinking to get
+        # by parse_tool_call); HTML-escape it and extract thinking to get
         # the content the user should see.
         content_text = html.escape(content_prefix)
         thinking_content, intermediate = extract_thinking_block(content_text)
diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
new file mode 100644
index 00000000..460188d3
--- /dev/null
+++ b/modules/tool_parsing.py
@@ -0,0 +1,553 @@
+import json
+import random
+import re
+
+
+def get_tool_call_id() -> str:
+    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"  # lowercase letters and digits
+    suffix = "".join(random.choice(alphabet) for _ in range(8))
+    return f"call_{suffix.lower()}"
+
+
+def check_and_sanitize_tool_call_candidate(candidate_dict: dict, tool_names: list[str]):
+    """Coerce a decoded JSON value into {"type": "function", "function": {...}}; return None unless it names a known tool."""
+    if not isinstance(candidate_dict, dict):  # bare JSON scalars/lists (e.g. an answer of "42") are never tool calls
+        return None
+    if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str):
+        candidate_dict = {"type": "function", "function": candidate_dict}
+    if 'function' in candidate_dict and isinstance(candidate_dict['function'], str):
+        candidate_dict['name'] = candidate_dict.pop('function')
+        candidate_dict = {"type": "function", "function": candidate_dict}
+    if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict):
+        # check if 'name' exists within 'function' and is part of known tools
+        if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names:
+            candidate_dict["type"] = "function"  # ensure required property 'type' exists and has the right value
+            # map property 'parameters' used by some older models to 'arguments'
+            if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]:
+                candidate_dict["function"]["arguments"] = candidate_dict["function"].pop("parameters")
+            return candidate_dict
+    return None
+
+
+def _extract_balanced_json(text: str, start: int) -> str | None:
+    """Return the brace-balanced JSON object that begins at text[start].
+
+    Scans forward counting '{' and '}' while ignoring anything inside
+    double-quoted strings (including backslash-escaped characters).
+    Returns None when text[start] is not '{' or the object never closes.
+    """
+    if not (start < len(text) and text[start] == '{'):
+        return None
+    nesting = 0
+    inside_string = False
+    skip_char = False
+    for pos in range(start, len(text)):
+        ch = text[pos]
+        if skip_char:
+            # Character following a backslash inside a string: take it literally
+            skip_char = False
+        elif inside_string:
+            if ch == '\\':
+                skip_char = True
+            elif ch == '"':
+                inside_string = False
+        elif ch == '"':
+            inside_string = True
+        elif ch == '{':
+            nesting += 1
+        elif ch == '}':
+            nesting -= 1
+            if nesting == 0:
+                return text[start:pos + 1]
+    return None
+
+
+def _parse_channel_tool_calls(answer: str, tool_names: list[str]):
+    """Extract tool calls written in the channel syntax of GPT-OSS-like models.
+
+    Recognized layouts:
+        <|start|>assistant to=functions.func_name<|channel|>commentary json<|message|>{"arg": "value"}
+        <|channel|>commentary to=functions.func_name <|constrain|>json<|message|>{"arg": "value"}
+
+    Returns (calls, start_pos): the parsed calls and the offset in answer
+    where the first call begins (None when nothing was parsed). The second
+    layout is only tried when the first one yields no calls.
+    """
+    header_patterns = (
+        # to=functions.NAME placed before <|channel|> (GPT-OSS primary format)
+        r'to=functions\.([^<\s]+)\s*<\|channel\|>[^<]*<\|message\|>',
+        # to=functions.NAME placed after <|channel|> (alternative format)
+        r'<\|channel\|>\w+ to=functions\.([^<\s]+).*?<\|message\|>',
+    )
+    calls = []
+    first_offset = None
+    for header_re in header_patterns:
+        for header in re.finditer(header_re, answer):
+            tool = header.group(1).strip()
+            if tool not in tool_names:
+                continue
+            payload = _extract_balanced_json(answer, header.end())
+            if payload is None:
+                continue
+            try:
+                args = json.loads(payload)
+            except json.JSONDecodeError:
+                continue
+            if first_offset is None:
+                # Prefer the enclosing <|start|>assistant marker when present
+                marker = answer.rfind('<|start|>assistant', 0, header.start())
+                first_offset = header.start() if marker == -1 else marker
+            calls.append(
+                {"type": "function", "function": {"name": tool, "arguments": args}}
+            )
+        if calls:
+            break
+    return calls, first_offset
+
+
+def _parse_mistral_token_tool_calls(answer: str, tool_names: list[str]):
+    """Extract Mistral/Devstral tool calls delimited by [TOOL_CALLS] and [ARGS] tokens.
+
+    Expected layout:
+        [TOOL_CALLS]func_name[ARGS]{"arg": "value"}
+
+    Returns (calls, start_pos), where start_pos is the offset of the first
+    [TOOL_CALLS] token that produced a call, or None if there was none.
+    """
+    token_re = re.compile(r'\[TOOL_CALLS\]\s*(\S+?)\s*\[ARGS\]\s*')
+    calls = []
+    first_offset = None
+
+    for token in token_re.finditer(answer):
+        tool = token.group(1).strip()
+        if tool not in tool_names:
+            continue
+        # The JSON arguments must start right where the token match ends
+        payload = _extract_balanced_json(answer, token.end())
+        if payload is None:
+            continue
+        try:
+            args = json.loads(payload)
+        except json.JSONDecodeError:
+            continue
+        if first_offset is None:
+            first_offset = token.start()
+        calls.append({
+            "type": "function",
+            "function": {"name": tool, "arguments": args},
+        })
+    return calls, first_offset
+
+
+def _parse_bare_name_tool_calls(answer: str, tool_names: list[str]):
+    """Parse bare function-name style tool calls used by Mistral and similar models.
+
+    Format:
+        functionName{"arg": "value"}
+    Multiple calls are concatenated directly or separated by whitespace.
+
+    Returns (matches, start_pos): the parsed calls and the offset in answer
+    of the first parsed call (None when no call was parsed).
+    """
+    matches = []
+    start_pos = None
+    if not tool_names:
+        # An empty alternation would match every '{' in the answer
+        return matches, start_pos
+    # Capture the tool name in the regex itself. Recovering it afterwards with
+    # startswith() over tool_names picked the wrong tool whenever one name was
+    # a prefix of another (e.g. "get" vs "get_weather").
+    escaped_names = [re.escape(name) for name in tool_names]
+    pattern = r'(' + '|'.join(escaped_names) + r')\s*\{'
+    for match in re.finditer(pattern, answer):
+        name = match.group(1)
+        json_str = _extract_balanced_json(answer, match.end() - 1)  # start at the '{'
+        if json_str is None:
+            continue
+        try:
+            arguments = json.loads(json_str)
+        except json.JSONDecodeError:
+            continue
+        if start_pos is None:
+            start_pos = match.start()
+        matches.append({
+            "type": "function",
+            "function": {
+                "name": name,
+                "arguments": arguments
+            }
+        })
+    return matches, start_pos
+
+
+def _parse_xml_param_tool_calls(answer: str, tool_names: list[str]):
+    """Extract tool calls written with <function=...>/<parameter=...> tags (Qwen3.5 and similar).
+
+    Expected layout:
+        <tool_call>
+        <function=function_name>
+        <parameter=param_name>value</parameter>
+        </function>
+        </tool_call>
+
+    Each parameter value is decoded as JSON when possible and otherwise kept
+    as a string. Returns (calls, start_pos), where start_pos is the offset of
+    the first <tool_call> block that produced a call, or None.
+    """
+    calls = []
+    first_offset = None
+    for block in re.finditer(r'<tool_call>\s*(.*?)\s*</tool_call>', answer, re.DOTALL):
+        body = block.group(1)
+        header = re.search(r'<function=([^>]+)>', body)
+        tool = header.group(1).strip() if header else None
+        if header is None or tool not in tool_names:
+            continue
+
+        args = {}
+        for param in re.finditer(r'<parameter=([^>]+)>\s*(.*?)\s*</parameter>', body, re.DOTALL):
+            raw = param.group(2).strip()
+            try:
+                value = json.loads(raw)
+            except (json.JSONDecodeError, ValueError):
+                value = raw  # not JSON: keep the text as-is
+            args[param.group(1).strip()] = value
+
+        if first_offset is None:
+            first_offset = block.start()
+        calls.append({
+            "type": "function",
+            "function": {"name": tool, "arguments": args},
+        })
+    return calls, first_offset
+
+
+def _parse_kimi_tool_calls(answer: str, tool_names: list[str]):
+    """Extract Kimi-K2 tool calls delimited by <|tool_call_begin|>-style tokens.
+
+    Expected layout:
+        <|tool_calls_section_begin|>
+        <|tool_call_begin|>functions.func_name:index<|tool_call_argument_begin|>{"arg": "value"}<|tool_call_end|>
+        <|tool_calls_section_end|>
+
+    The "functions." prefix and ":index" suffix around the name are optional.
+    Returns (calls, start_pos), where start_pos is the offset of the section
+    marker (or of the call marker if no section marker precedes it) for the
+    first parsed call, or None when nothing was parsed.
+    """
+    call_re = r'<\|tool_call_begin\|>\s*(?:functions\.)?(\S+?)(?::\d+)?\s*<\|tool_call_argument_begin\|>\s*'
+    calls = []
+    first_offset = None
+    for call in re.finditer(call_re, answer):
+        tool = call.group(1).strip()
+        if tool not in tool_names:
+            continue
+        payload = _extract_balanced_json(answer, call.end())
+        if payload is None:
+            continue
+        try:
+            args = json.loads(payload)
+        except json.JSONDecodeError:
+            continue
+        if first_offset is None:
+            # Start the span at the enclosing section marker when there is one
+            opener = answer.rfind('<|tool_calls_section_begin|>', 0, call.start())
+            first_offset = call.start() if opener == -1 else opener
+        calls.append({
+            "type": "function",
+            "function": {"name": tool, "arguments": args},
+        })
+    return calls, first_offset
+
+
+def _parse_minimax_tool_calls(answer: str, tool_names: list[str]):
+    """Extract MiniMax tool calls expressed as <invoke>/<parameter> XML tags.
+
+    Expected layout:
+        <minimax:tool_call>
+        <invoke name="function_name">
+        <parameter name="param_name">value</parameter>
+        </invoke>
+        </minimax:tool_call>
+
+    One <minimax:tool_call> block may hold several <invoke> elements. Each
+    parameter value is decoded as JSON when possible, else kept as a string.
+    Returns (calls, start_pos); start_pos is the offset of the first block
+    that produced a call, or None.
+    """
+    block_re = r'<minimax:tool_call>\s*(.*?)\s*</minimax:tool_call>'
+    invoke_re = r'<invoke\s+name="([^"]+)">(.*?)</invoke>'
+    param_re = r'<parameter\s+name="([^"]+)">\s*(.*?)\s*</parameter>'
+    calls = []
+    first_offset = None
+    for block in re.finditer(block_re, answer, re.DOTALL):
+        for invoke in re.finditer(invoke_re, block.group(1), re.DOTALL):
+            tool = invoke.group(1).strip()
+            if tool not in tool_names:
+                continue
+            args = {}
+            for param in re.finditer(param_re, invoke.group(2), re.DOTALL):
+                raw = param.group(2).strip()
+                try:
+                    value = json.loads(raw)
+                except (json.JSONDecodeError, ValueError):
+                    value = raw  # not JSON: keep the text as-is
+                args[param.group(1).strip()] = value
+            if first_offset is None:
+                first_offset = block.start()
+            calls.append(
+                {"type": "function", "function": {"name": tool, "arguments": args}}
+            )
+    return calls, first_offset
+
+
+def _parse_deep_seek_tool_calls(answer: str, tool_names: list[str]):
+    """Extract DeepSeek tool calls delimited by fullwidth Unicode tokens.
+
+    Expected layout:
+        <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>func_name<｜tool▁sep｜>{"arg": "value"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>
+
+    Returns (calls, start_pos), where start_pos is the offset of the section
+    marker (or of the call marker if no section marker precedes it) for the
+    first parsed call, or None when nothing was parsed.
+    """
+    call_re = r'<｜tool▁call▁begin｜>\s*(\S+?)\s*<｜tool▁sep｜>\s*'
+    calls = []
+    first_offset = None
+
+    for call in re.finditer(call_re, answer):
+        tool = call.group(1).strip()
+        if tool not in tool_names:
+            continue
+        payload = _extract_balanced_json(answer, call.end())
+        if payload is None:
+            continue
+        try:
+            args = json.loads(payload)
+        except json.JSONDecodeError:
+            continue
+        if first_offset is None:
+            # Start the span at the enclosing section marker when there is one
+            opener = answer.rfind('<｜tool▁calls▁begin｜>', 0, call.start())
+            first_offset = call.start() if opener == -1 else opener
+        calls.append({
+            "type": "function",
+            "function": {"name": tool, "arguments": args},
+        })
+    return calls, first_offset
+
+
+def _parse_glm_tool_calls(answer: str, tool_names: list[str]):
+    """Extract GLM tool calls expressed as <arg_key>/<arg_value> tag pairs.
+
+    Expected layout:
+        <tool_call>function_name
+        <arg_key>key1</arg_key>
+        <arg_value>value1</arg_value>
+        </tool_call>
+
+    Keys and values are paired by position; a block with unequal counts is
+    skipped. Each value is decoded as JSON when possible, else kept as a
+    string. Returns (calls, start_pos); start_pos is the offset of the first
+    <tool_call> block that produced a call, or None.
+    """
+    calls = []
+    first_offset = None
+    for block in re.finditer(r'<tool_call>\s*(.*?)\s*</tool_call>', answer, re.DOTALL):
+        body = block.group(1)
+        # The function name is the leading run of non-space, non-'<' characters
+        leading = re.match(r'([^<\s]+)', body.strip())
+        if leading is None or leading.group(1).strip() not in tool_names:
+            continue
+        tool = leading.group(1).strip()
+
+        names = [m.group(1).strip() for m in re.finditer(r'<arg_key>\s*(.*?)\s*</arg_key>', body, re.DOTALL)]
+        raws = [m.group(1).strip() for m in re.finditer(r'<arg_value>\s*(.*?)\s*</arg_value>', body, re.DOTALL)]
+        if len(names) != len(raws):
+            continue
+
+        args = {}
+        for name, raw in zip(names, raws):
+            try:
+                args[name] = json.loads(raw)
+            except (json.JSONDecodeError, ValueError):
+                args[name] = raw  # not JSON: keep the text as-is
+
+        if first_offset is None:
+            first_offset = block.start()
+        calls.append(
+            {"type": "function", "function": {"name": tool, "arguments": args}}
+        )
+    return calls, first_offset
+
+
+def _parse_pythonic_tool_calls(answer: str, tool_names: list[str]):
+    """Parse pythonic-style tool calls used by Llama 4 and similar models.
+
+    Format:
+        [func_name(param1="value1", param2="value2"), func_name2(...)]
+
+    Every bracketed group in the answer is inspected in order and the first
+    one containing a call to a known tool is used, so earlier bracketed text
+    (citations like "[1]", markdown links) does not hide the call list.
+
+    Returns (matches, start_pos): the parsed calls and the offset of the
+    opening bracket of the group they came from (None when nothing matched).
+    """
+    matches = []
+    start_pos = None
+    if not tool_names:
+        # An empty alternation would match any "(...)" as a call named ""
+        return matches, start_pos
+
+    # Known tool names only; the lookbehind rejects a name that is merely the
+    # tail of a longer identifier (e.g. "forget(" for a tool named "get").
+    name_pattern = '|'.join(re.escape(name) for name in tool_names)
+    call_pattern = r'(?<!\w)(' + name_pattern + r')\(([^)]*)\)'
+    # key=value pairs; quoted values may contain commas and escaped quotes
+    param_pattern = r'(\w+)\s*=\s*("(?:[^"\\]|\\.)*"|\'(?:[^\'\\]|\\.)*\'|[^,\)]+)'
+
+    for bracket_match in re.finditer(r'\[([^\[\]]+)\]', answer):
+        for call_match in re.finditer(call_pattern, bracket_match.group(1)):
+            func_name = call_match.group(1)
+            arguments = {}
+            for param_match in re.finditer(param_pattern, call_match.group(2).strip()):
+                param_value = param_match.group(2).strip()
+                # Strip surrounding quotes
+                if (param_value.startswith('"') and param_value.endswith('"')) or \
+                   (param_value.startswith("'") and param_value.endswith("'")):
+                    param_value = param_value[1:-1]
+                # Try to parse as JSON for numeric/bool/null values
+                try:
+                    param_value = json.loads(param_value)
+                except (json.JSONDecodeError, ValueError):
+                    pass  # keep as string
+                arguments[param_match.group(1)] = param_value
+
+            if start_pos is None:
+                start_pos = bracket_match.start()
+            matches.append({
+                "type": "function",
+                "function": {
+                    "name": func_name,
+                    "arguments": arguments
+                }
+            })
+        if matches:
+            # Use only the first bracketed group that yields calls
+            break
+
+    return matches, start_pos
+
+
+def parse_tool_call(answer: str, tool_names: list[str], return_prefix: bool = False):
+    """Extract tool calls from a model answer, trying each known syntax in turn.
+
+    Model-specific parsers run first, in the order below; the first one that
+    finds a call to a known tool wins and the remaining ones are skipped:
+        1. DeepSeek (fullwidth Unicode token delimiters)
+        2. Kimi-K2 (pipe-delimited tokens)
+        3. channel-based (e.g. GPT-OSS format)
+        4. MiniMax (invoke/parameter XML tags)
+        5. GLM (arg_key/arg_value XML pairs)
+        6. XML-parameter (e.g. Qwen3.5 format)
+        7. Mistral/Devstral ([TOOL_CALLS]name[ARGS]json)
+        8. bare function name (e.g. Mistral format)
+        9. pythonic (e.g. Llama 4 format)
+
+    If none of them matches, generic JSON extraction is attempted, in this
+    order: JSON inside ``` code fences; the whole answer as plain JSON (only
+    if the fences yielded nothing); JSON wrapped in XML-like tags such as
+    <function>, <tools> or <tool_call>. A call that the tag pass finds again
+    after an earlier pass already collected it (e.g. a fenced block nested
+    inside <tool_call> tags) is reported once, not twice.
+
+    Args:
+        answer: raw model output to scan.
+        tool_names: names of the tools the model may call; calls to any
+            other name are ignored.
+        return_prefix: when True, also return the text that precedes the
+            first tool call.
+
+    Returns:
+        A list of calls shaped like
+        {"type": "function", "function": {"name": ..., "arguments": ...}}.
+        With return_prefix=True, a (calls, prefix) tuple instead; prefix is
+        '' when no call was found or when the call has no known position
+        (plain-JSON answers).
+    """
+    def _return(matches, start_pos):
+        if return_prefix:
+            prefix = answer[:start_pos] if matches and start_pos is not None else ''
+            return matches, prefix
+        return matches
+
+    model_parsers = (
+        _parse_deep_seek_tool_calls,
+        _parse_kimi_tool_calls,
+        _parse_channel_tool_calls,
+        _parse_minimax_tool_calls,
+        _parse_glm_tool_calls,
+        _parse_xml_param_tool_calls,
+        _parse_mistral_token_tool_calls,
+        _parse_bare_name_tool_calls,
+        _parse_pythonic_tool_calls,
+    )
+    for parser in model_parsers:
+        matches, start_pos = parser(answer, tool_names)
+        if matches:
+            return _return(matches, start_pos)
+
+    # Generic extraction: no model-specific syntax was recognized
+    matches = []
+    start_pos = None
+
+    def _collect(candidate, offset, already_found):
+        """Decode candidate as JSON and append every valid, new tool call to matches."""
+        nonlocal start_pos
+        # llm might have generated multiple json objects separated by linebreaks;
+        # turn them into one comma-separated list so they parse together
+        candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
+        if not candidate.strip().startswith("["):
+            candidate = "[" + candidate + "]"
+        try:
+            candidates = json.loads(candidate)
+        except json.JSONDecodeError:
+            return  # Ignore invalid JSON silently
+        if not isinstance(candidates, list):
+            candidates = [candidates]
+
+        for candidate_dict in candidates:
+            # Bare scalars (an answer that is just "42" or "null") are valid
+            # JSON but cannot be tool calls; the sanitizer expects a mapping
+            if not isinstance(candidate_dict, dict):
+                continue
+            checked_candidate = check_and_sanitize_tool_call_candidate(candidate_dict, tool_names)
+            if checked_candidate is None:
+                continue
+            if offset is not None and (start_pos is None or offset < start_pos):
+                start_pos = offset
+            if checked_candidate not in already_found:
+                matches.append(checked_candidate)
+
+    # JSON content wrapped in code fences, or in <function>, <tools>, <tool_call>
+    # and other tags observed from various models
+    fence_pattern = r"(```[^\n]*)\n(.*?)```"
+    tag_pattern = r"<([^>]+)>(.*?)</\1>"
+
+    for pattern in (fence_pattern, tag_pattern):
+        # Calls collected by earlier passes; finding one of them again is a duplicate
+        already_found = list(matches)
+        for match in re.finditer(pattern, answer, re.DOTALL):
+            # remove backtick wraps if present
+            candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip())
+            candidate = re.sub(r"```$", "", candidate.strip())
+            # unwrap inner tags
+            candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL)
+            _collect(candidate, match.start(), already_found)
+
+        # last resort if nothing has been mapped: LLM might have produced plain json
+        # tool call without xml-like tags. Tried once, right after the fence pass.
+        if pattern is fence_pattern and not matches:
+            _collect(answer, None, already_found)
+
+    return _return(matches, start_pos)
diff --git a/modules/tool_use.py b/modules/tool_use.py
index 55424853..e22b1798 100644
--- a/modules/tool_use.py
+++ b/modules/tool_use.py
@@ -3,7 +3,7 @@ import json
 
 from modules import shared
 from modules.logging_colors import logger
-from modules.utils import natural_keys
+from modules.utils import natural_keys, sanitize_filename
 
 
 def get_available_tools():
@@ -23,6 +23,10 @@ def load_tools(selected_names):
     tool_defs = []
     executors = {}
     for name in selected_names:
+        name = sanitize_filename(name)
+        if not name:
+            continue
+
         path = shared.user_data_dir / 'tools' / f'{name}.py'
         if not path.exists():
             continue
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index ce9fc0a2..0acf9c04 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -97,7 +97,7 @@ def create_ui():
                 shared.gradio['tools_refresh'].click(fn=lambda: gr.update(choices=get_available_tools()), inputs=[], outputs=[shared.gradio['selected_tools']])
 
                 def sync_web_tools(selected):
-                    if 'web_search' in selected and 'fetch_webpage' not in selected:
+                    if 'web_search' in selected and 'fetch_webpage' not in selected and 'fetch_webpage' in get_available_tools():
                         selected.append('fetch_webpage')
 
                     return gr.update(value=selected)
diff --git a/modules/web_search.py b/modules/web_search.py
index 754dd111..216d7933 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -1,11 +1,13 @@
 import concurrent.futures
 import html
+import ipaddress
 import random
 import re
+import socket
 import urllib.request
 from concurrent.futures import as_completed
 from datetime import datetime
-from urllib.parse import quote_plus
+from urllib.parse import quote_plus, urlparse
 
 import requests
 
@@ -13,6 +15,26 @@ from modules import shared
 from modules.logging_colors import logger
 
 
+def _validate_url(url):
+    """Raise ValueError unless url is http(s) and every address its host resolves to is public."""
+    parsed = urlparse(url)
+    if parsed.scheme not in ('http', 'https'):
+        raise ValueError(f"Unsupported URL scheme: {parsed.scheme}")
+
+    hostname = parsed.hostname
+    if not hostname:
+        raise ValueError("No hostname in URL")
+
+    # Resolve hostname and check all returned addresses; multicast and non-global ranges (e.g. 100.64.0.0/10) are refused too
+    try:
+        for family, _, _, _, sockaddr in socket.getaddrinfo(hostname, None):
+            ip = ipaddress.ip_address(sockaddr[0])
+            if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast or not ip.is_global:
+                raise ValueError(f"Access to private/internal address {ip} is blocked")
+    except socket.gaierror as exc:
+        raise ValueError(f"Could not resolve hostname: {hostname}") from exc
+
+
 def get_current_timestamp():
     """Returns the current time in 24-hour format"""
     return datetime.now().strftime('%b %d, %Y %H:%M')
@@ -25,11 +47,20 @@ def download_web_page(url, timeout=10, include_links=False):
     import html2text
 
     try:
+        _validate_url(url)
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
-        response = requests.get(url, headers=headers, timeout=timeout)
-        response.raise_for_status()  # Raise an exception for bad status codes
+        max_redirects = 5
+        for _ in range(max_redirects):
+            response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=False)
+            if response.is_redirect and 'Location' in response.headers:
+                url = response.headers['Location']
+                _validate_url(url)
+            else:
+                break
+
+        response.raise_for_status()
 
         # Initialize the HTML to Markdown converter
         h = html2text.HTML2Text()

From 16636c04b88df924b1af0a79dfe5cd574aa33753 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 19:06:04 -0700
Subject: [PATCH 081/210] UI: Minor fix/optimization

---
 js/main.js | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/js/main.js b/js/main.js
index a8bbbc71..5f79c3ec 100644
--- a/js/main.js
+++ b/js/main.js
@@ -1075,15 +1075,13 @@ document.fonts.addEventListener("loadingdone", (event) => {
     const currentHeight = chatInputRow.offsetHeight;
     const heightDifference = currentHeight - originalHeight;
     chatParent.style.marginBottom = `${originalMarginBottom + heightDifference}px`;
+    if (!window.isScrolled) {
+      chatParent.scrollTop = chatParent.scrollHeight - chatParent.clientHeight;
+    }
   }
 
-  // Watch for changes that might affect height
-  const observer = new MutationObserver(updateMargin);
-  observer.observe(chatInputRow, {
-    childList: true,
-    subtree: true,
-    attributes: true
-  });
+  // Watch for size changes that affect height
+  new ResizeObserver(updateMargin).observe(chatInputRow);
 
   // Also listen for window resize
   window.addEventListener("resize", updateMargin);

From 5f1707af3562c1854068af6669d055bdf02cf038 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 20:38:40 -0700
Subject: [PATCH 082/210] UI: Increase the width of non-instruct chat styles

---
 css/chat_style-Dark.css            | 1 +
 css/chat_style-TheEncrypted777.css | 1 +
 css/chat_style-cai-chat-square.css | 1 +
 css/chat_style-cai-chat.css        | 1 +
 css/chat_style-messenger.css       | 1 +
 css/chat_style-wpp.css             | 1 +
 css/main.css                       | 1 -
 7 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css
index 6a4784cc..28d77a4b 100644
--- a/css/chat_style-Dark.css
+++ b/css/chat_style-Dark.css
@@ -2,6 +2,7 @@
     display: grid;
     align-items: start;
     grid-template-columns: 60px minmax(0, 1fr);
+    width: min(100%, calc(768px + 60px));
     padding-bottom: 22px;
     padding-top: 6px;
     font-size: 18px;
diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css
index fbd47072..21156ee5 100644
--- a/css/chat_style-TheEncrypted777.css
+++ b/css/chat_style-TheEncrypted777.css
@@ -4,6 +4,7 @@
     display: grid;
     align-items: start;
     grid-template-columns: 60px minmax(0, 1fr);
+    width: min(100%, calc(768px + 60px + 90px));
     padding-bottom: 21px;
     padding-top: 7px;
     font-size: 18px;
diff --git a/css/chat_style-cai-chat-square.css b/css/chat_style-cai-chat-square.css
index 291a1209..0d9467df 100644
--- a/css/chat_style-cai-chat-square.css
+++ b/css/chat_style-cai-chat-square.css
@@ -19,4 +19,5 @@
     padding-bottom: 1.5em;
     padding-top: 0.5em;
     grid-template-columns: 70px minmax(0, 1fr);
+    width: min(100%, calc(768px + 70px));
 }
diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css
index b06b1269..6de32597 100644
--- a/css/chat_style-cai-chat.css
+++ b/css/chat_style-cai-chat.css
@@ -2,6 +2,7 @@
     display: grid;
     align-items: start;
     grid-template-columns: 60px minmax(0, 1fr);
+    width: min(100%, calc(768px + 60px));
     padding-bottom: 1.5em;
     padding-top: 0.5em;
     font-size: 15px;
diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css
index 70fd6d4a..85178759 100644
--- a/css/chat_style-messenger.css
+++ b/css/chat_style-messenger.css
@@ -1,4 +1,5 @@
 .message {
+    width: min(100%, calc(48rem + 60px));
     padding-bottom: 22px;
     padding-top: 3px;
     font-size: 15px;
diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css
index b2ac4d2a..5c14fa80 100644
--- a/css/chat_style-wpp.css
+++ b/css/chat_style-wpp.css
@@ -1,5 +1,6 @@
 .message {
     display: block;
+    width: min(100%, 48rem);
     padding-top: 0;
     padding-bottom: 21px;
     font-size: 15px;
diff --git a/css/main.css b/css/main.css
index 7cc496a7..5a58c4a3 100644
--- a/css/main.css
+++ b/css/main.css
@@ -400,7 +400,6 @@ audio {
 }
 
 .chat .message {
-    width: min(100%, 48rem);
     margin-left: auto;
     margin-right: auto;
     text-align: start;

From 998b9bfb2a6df1e0bd37bd4dfcdbd10bf4a38977 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 13 Mar 2026 21:05:31 -0700
Subject: [PATCH 083/210] UI: Make all chat styles better match instruct style

---
 css/chat_style-Dark.css            |  5 +----
 css/chat_style-TheEncrypted777.css |  6 ++----
 css/chat_style-cai-chat.css        |  8 +-------
 css/chat_style-messenger.css       | 13 +++++--------
 css/chat_style-wpp.css             | 11 +----------
 css/main.css                       | 22 +++++++++++++++++++---
 6 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css
index 28d77a4b..02beb935 100644
--- a/css/chat_style-Dark.css
+++ b/css/chat_style-Dark.css
@@ -92,9 +92,6 @@
 }
 
 .message-body p {
-    margin-bottom: 0 !important;
-    font-size: 16px !important;
-    line-height: 1.5 !important;
     color: #e0e0e0 !important; /* Light color for text */
 }
 
@@ -123,7 +120,7 @@
     }
 
     .message-body p {
-        font-size: 14px !important; /* Smaller text for mobile */
+        font-size: 14px !important;
     }
 
     .username {
diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css
index 21156ee5..b3df6710 100644
--- a/css/chat_style-TheEncrypted777.css
+++ b/css/chat_style-TheEncrypted777.css
@@ -87,10 +87,8 @@
     border-radius: 20px;
 }
 
-.message-body p {
-    margin-bottom: 0 !important;
+.message-body p, .message-body li {
     font-size: 18px !important;
-    line-height: 1.428571429 !important;
     color: rgb(243 244 246) !important;
     text-shadow: 2px 2px 2px rgb(0 0 0);
     font-weight: 500;
@@ -128,7 +126,7 @@
         padding-left: 0;
     }
 
-    .message-body p {
+    .message-body p, .message-body li {
         font-size: 16px !important;
     }
 
diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css
index 6de32597..9cc4d4cd 100644
--- a/css/chat_style-cai-chat.css
+++ b/css/chat_style-cai-chat.css
@@ -47,16 +47,10 @@
     border-radius: 20px;
 }
 
-.message-body p {
-    font-size: 15px !important;
-    line-height: 22.5px !important;
+.message-body p, .message-body li {
     font-weight: 500;
 }
 
-.message-body p, .chat .message-body ul, .chat .message-body ol {
-    margin-bottom: 10px !important;
-}
-
 .dark .message-body p em {
     color: rgb(138 138 138) !important;
 }
diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css
index 85178759..438b8060 100644
--- a/css/chat_style-messenger.css
+++ b/css/chat_style-messenger.css
@@ -61,8 +61,10 @@
     text-align: right;
 }
 
-.dark .circle-bot + .text div, .dark .circle-bot + .text * {
-    color: #000;
+.dark .circle-bot + .text div, .dark .circle-bot + .text *,
+.dark .chat .message .circle-bot + .text .message-body :is(h1, h2, h3, h4, h5, h6),
+.dark .chat .message .circle-bot + .text .message-body a {
+    color: #000 !important;
 }
 
 .text {
@@ -77,19 +79,14 @@
     font-weight: bold;
 }
 
-.message-body {
-}
-
 .message-body img {
     max-width: 300px;
     max-height: 300px;
     border-radius: 20px;
 }
 
-.message-body p {
-    margin-bottom: 0 !important;
+.message-body p, .message-body li {
     font-size: 15px !important;
-    line-height: 1.428571429 !important;
     font-weight: 500;
 }
 
diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css
index 5c14fa80..ad6985d2 100644
--- a/css/chat_style-wpp.css
+++ b/css/chat_style-wpp.css
@@ -78,14 +78,8 @@
     border-radius: 12px;
 }
 
-.message-body p {
+.message-body p, .message-body li {
     font-size: 15px !important;
-    line-height: 1.4 !important;
-    font-weight: 400;
-}
-
-.message-body p:first-child {
-    margin-top: 0 !important;
 }
 
 .dark .message-body p em {
@@ -101,6 +95,3 @@
     margin-top: 8px;
 }
 
-.message-body p, .chat .message-body ul, .chat .message-body ol {
-    margin-bottom: 10px !important;
-}
diff --git a/css/main.css b/css/main.css
index 5a58c4a3..49b8f752 100644
--- a/css/main.css
+++ b/css/main.css
@@ -430,7 +430,12 @@ audio {
     font-size: 16px;
 }
 
-.dark .message-body :is(h1, h2, h3, h4, h5, h6) {
+.dark .message-body h1,
+.dark .message-body h2,
+.dark .message-body h3,
+.dark .message-body h4,
+.dark .message-body h5,
+.dark .message-body h6 {
     color: white !important;
 }
 
@@ -830,9 +835,20 @@ audio {
     }
 }
 
-.message-body ol, .message-body ul {
+.message-body p, .message-body li {
+    line-height: 1.75 !important;
+}
+
+.message-body p, .message-body ul, .message-body ol {
+    margin: 1.25em 0 !important;
+}
+
+.message-body :is(p, ul, ol):first-child {
     margin-top: 0 !important;
-    margin-bottom: 1.25em !important;
+}
+
+.message-body :is(p, ul, ol):last-child {
+    margin-bottom: 0 !important;
 }
 
 /* ----------------------------------------------

From accb2ef661838d95521c5fc7fd7660bf541a5064 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 06:16:09 -0700
Subject: [PATCH 084/210] UI/API: Prevent tool call markup from leaking into
 streamed UI output (closes #7427)

---
 modules/chat.py         | 11 +++++++++
 modules/tool_parsing.py | 49 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index 02ae46e4..daecd50b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1028,6 +1028,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
                 thinking_prefix = start_tag
                 break
 
+    # When tools are active, buffer streaming output during potential tool
+    # call generation to prevent raw markup from leaking into the display.
+    _check_tool_markers = bool(state.get('tools'))
+    if _check_tool_markers:
+        from modules.tool_parsing import streaming_tool_buffer_check
+        _tool_names = [t['function']['name'] for t in state['tools'] if 'function' in t and 'name' in t['function']]
+
     # Generate
     reply = None
     for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True, for_ui=for_ui)):
@@ -1077,6 +1084,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             })
 
         if is_stream:
+            if _check_tool_markers:
+                if streaming_tool_buffer_check(output['internal'][-1][1], _tool_names):
+                    continue
+
             yield output
 
     if _continue:
diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index 460188d3..418503ad 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -9,6 +9,55 @@ def get_tool_call_id() -> str:
     return "call_" + "".join(b).lower()
 
 
+# Known opening markers for tool calls across model formats.
+# Used during streaming to buffer output that might be tool call markup,
+# preventing raw markup from leaking into displayed/streamed content.
+TOOL_CALL_OPENING_MARKERS = [
+    '<tool_call>',
+    '<function_call>',
+    '<minimax:tool_call>',
+    '<|tool_call_begin|>',
+    '<|tool_calls_section_begin|>',
+    '<｜tool▁call▁begin｜>',
+    '<｜tool▁calls▁begin｜>',
+    '[TOOL_CALLS]',
+    'to=functions.',
+    '<|channel|>commentary',
+]
+
+def streaming_tool_buffer_check(text, tool_names=None):
+    '''
+    Check whether streaming output should be withheld because it may
+    contain tool-call markup.
+    '''
+    # Full marker found → buffer permanently
+    for marker in TOOL_CALL_OPENING_MARKERS:
+        if marker in text:
+            return True
+
+    # Bare function-name style (e.g. Devstral): "get_weather{...}"
+    # Only match tool name followed by '{' to avoid false positives on
+    # common words that happen to be tool names (e.g. "get", "search").
+    if tool_names:
+        for name in tool_names:
+            if name + '{' in text or name + ' {' in text:
+                return True
+            # Partial: text ends with tool name (or prefix of it) but '{' hasn't arrived yet
+            if text.endswith(name):
+                return True
+            for prefix_len in range(min(len(name) - 1, len(text)), 0, -1):
+                if text.endswith(name[:prefix_len]):
+                    return True
+
+    # Tail might be a partial marker forming across tokens
+    for marker in TOOL_CALL_OPENING_MARKERS:
+        for prefix_len in range(min(len(marker) - 1, len(text)), 0, -1):
+            if text.endswith(marker[:prefix_len]):
+                return True
+
+    return False
+
+
 def check_and_sanitize_tool_call_candidate(candidate_dict: dict, tool_names: list[str]):
     # check if property 'function' exists and is a dictionary, otherwise adapt dict
     if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str):

From 09a6549816eca117d779fdf66bad9862d559f288 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 06:52:40 -0700
Subject: [PATCH 085/210] API: Stream reasoning_content separately from content
 in OpenAI-compatible responses

---
 extensions/openai/completions.py | 29 +++++++++++++++++++++++------
 modules/reasoning.py             | 16 ++++++++++++++--
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 27defe42..51427050 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -417,7 +417,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         logprob_proc.token_alternatives_history.clear()
     chat_logprobs_offset = [0]  # mutable for closure access in streaming
 
-    def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False):
+    def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False, reasoning_content=None):
         # begin streaming
         delta = {}
         if include_role:
@@ -425,6 +425,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             delta['refusal'] = None
         if content is not None:
             delta['content'] = content
+        if reasoning_content is not None:
+            delta['reasoning_content'] = reasoning_content
         if chunk_tool_calls:
             delta['tool_calls'] = chunk_tool_calls
 
@@ -477,6 +479,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
 
     answer = ''
     seen_content = ''
+    seen_reasoning = ''
 
     tool_calls = []
     end_last_tool_call = 0
@@ -508,17 +511,31 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             break
 
         if stream:
-            len_seen = len(seen_content)
-            new_content = answer[len_seen:]
+            # Strip reasoning/thinking blocks so only final content is streamed.
+            # Reasoning is emitted separately as reasoning_content deltas.
+            reasoning, content = extract_reasoning(answer)
+            if reasoning is not None:
+                new_reasoning = reasoning[len(seen_reasoning):]
+                new_content = content[len(seen_content):]
+            else:
+                new_reasoning = None
+                new_content = answer[len(seen_content):]
 
-            if not new_content or chr(0xfffd) in new_content:  # partial unicode character, don't send it yet.
+            if (not new_content and not new_reasoning) or chr(0xfffd) in (new_content or '') + (new_reasoning or ''):
                 continue
 
-            chunk = chat_streaming_chunk(new_content)
+            chunk = chat_streaming_chunk(
+                content=new_content if new_content else None,
+                reasoning_content=new_reasoning if new_reasoning else None,
+            )
             if include_usage:
                 chunk['usage'] = None
 
-            seen_content = answer
+            if reasoning is not None:
+                seen_reasoning = reasoning
+                seen_content = content
+            else:
+                seen_content = answer
             yield chunk
 
     token_count = shared.model.last_prompt_token_count if hasattr(shared.model, 'last_prompt_token_count') else 0
diff --git a/modules/reasoning.py b/modules/reasoning.py
index 708ee55a..3a9ab546 100644
--- a/modules/reasoning.py
+++ b/modules/reasoning.py
@@ -8,7 +8,7 @@ THINKING_FORMATS = [
     ('<|channel|>commentary<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
     ('<seed:think>', '</seed:think>', None),
     ('<|think|>', '<|end|>', '<|content|>'),  # Solar Open
-    ('Thinking Process:', '</think>', None),  # Qwen3.5 verbose thinking outside tags
+    # ('Thinking Process:', '</think>', None),  # Qwen3.5 verbose thinking outside tags -- removed: too prone to false positives in streaming
     (None, '</think>', None),  # End-only variant (e.g., Qwen3-next)
 ]
 
@@ -42,6 +42,12 @@ def extract_reasoning(text, html_escaped=False):
             start_esc = esc(start_tag)
             start_pos = text.find(start_esc)
             if start_pos == -1:
+                # During streaming, the start tag may be arriving partially.
+                # If the text is a prefix of a start tag, return empty content
+                # to prevent the partial tag from leaking.
+                stripped = text.strip()
+                if stripped and start_esc.startswith(stripped):
+                    return '', ''
                 continue
             thought_start = start_pos + len(start_esc)
             end_pos = text.find(end_esc, thought_start)
@@ -63,7 +69,13 @@ def extract_reasoning(text, html_escaped=False):
             thought_end = end_pos
             if content_esc:
                 content_pos = text.find(content_esc, end_pos)
-                content_start = content_pos + len(content_esc) if content_pos != -1 else end_pos + len(end_esc)
+                if content_pos != -1:
+                    content_start = content_pos + len(content_esc)
+                else:
+                    # Content tag expected but not yet present (e.g. partial
+                    # streaming) — suppress intermediate tags between end_tag
+                    # and content_tag so they don't leak as content.
+                    content_start = len(text)
             else:
                 content_start = end_pos + len(end_esc)
 

From cb08ba63dcd76228a4c070acd5352a3df4d78486 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 09:08:05 -0700
Subject: [PATCH 086/210] Fix GPT-OSS channel markup leaking into UI when model
 skips analysis block

---
 modules/chat.py                |  2 ++
 modules/reasoning.py           | 14 ++++++++++++--
 modules/ui_image_generation.py |  2 ++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index daecd50b..10785c19 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -882,6 +882,8 @@ def generate_search_query(user_message, state):
         query = query.rsplit("</think>", 1)[1]
     elif "<|start|>assistant<|channel|>final<|message|>" in query:
         query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
+    elif "<|channel|>final<|message|>" in query:
+        query = query.rsplit("<|channel|>final<|message|>", 1)[1]
     elif "</seed:think>" in query:
         query = query.rsplit("</seed:think>", 1)[1]
 
diff --git a/modules/reasoning.py b/modules/reasoning.py
index 3a9ab546..bc61aab3 100644
--- a/modules/reasoning.py
+++ b/modules/reasoning.py
@@ -4,8 +4,8 @@ import html as html_module
 # Use None for start_tag to match from beginning (end-only formats should be listed last)
 THINKING_FORMATS = [
     ('<think>', '</think>', None),
-    ('<|channel|>analysis<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
-    ('<|channel|>commentary<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
+    ('<|channel|>analysis<|message|>', '<|end|>', '<|channel|>final<|message|>'),
+    ('<|channel|>commentary<|message|>', '<|end|>', '<|channel|>final<|message|>'),
     ('<seed:think>', '</seed:think>', None),
     ('<|think|>', '<|end|>', '<|content|>'),  # Solar Open
     # ('Thinking Process:', '</think>', None),  # Qwen3.5 verbose thinking outside tags -- removed: too prone to false positives in streaming
@@ -81,4 +81,14 @@ def extract_reasoning(text, html_escaped=False):
 
         return text[thought_start:thought_end], text[content_start:]
 
+    # Handle standalone GPT-OSS final channel marker without a preceding
+    # analysis/commentary block (the model skipped thinking entirely).
+    for marker in ['<|start|>assistant<|channel|>final<|message|>', '<|channel|>final<|message|>']:
+        marker_esc = esc(marker)
+        pos = text.find(marker_esc)
+        if pos != -1:
+            before = text[:pos].strip()
+            after = text[pos + len(marker_esc):]
+            return (before if before else None), after
+
     return None, text
diff --git a/modules/ui_image_generation.py b/modules/ui_image_generation.py
index e9df9bd3..dc108f6d 100644
--- a/modules/ui_image_generation.py
+++ b/modules/ui_image_generation.py
@@ -728,6 +728,8 @@ def generate_prompt_variation(state):
         variation = variation.rsplit("</think>", 1)[1]
     elif "<|start|>assistant<|channel|>final<|message|>" in variation:
         variation = variation.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
+    elif "<|channel|>final<|message|>" in variation:
+        variation = variation.rsplit("<|channel|>final<|message|>", 1)[1]
     elif "</seed:think>" in variation:
         variation = variation.rsplit("</seed:think>", 1)[1]
 

From 8bff331893e6ea1caf82b764e3d41514a04aa573 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 09:26:20 -0700
Subject: [PATCH 087/210] UI: Fix tool call markup flashing before accordion
 appears during streaming

---
 modules/chat.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index 10785c19..08f55539 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1033,6 +1033,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     # When tools are active, buffer streaming output during potential tool
     # call generation to prevent raw markup from leaking into the display.
     _check_tool_markers = bool(state.get('tools'))
+    _last_visible_before_tool_buffer = None
     if _check_tool_markers:
         from modules.tool_parsing import streaming_tool_buffer_check
         _tool_names = [t['function']['name'] for t in state['tools'] if 'function' in t and 'name' in t['function']]
@@ -1089,6 +1090,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             if _check_tool_markers:
                 if streaming_tool_buffer_check(output['internal'][-1][1], _tool_names):
                     continue
+                _last_visible_before_tool_buffer = output['visible'][-1][1]
 
             yield output
 
@@ -1122,6 +1124,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             'visible_content': output['visible'][row_idx][1]
         })
 
+    # When tool markers were detected during streaming, restore the last
+    # visible text from before buffering started so raw markup doesn't flash
+    # in the UI.  The internal text is left intact so the caller can still
+    # parse tool calls from it.
+    if is_stream and _check_tool_markers and streaming_tool_buffer_check(output['internal'][-1][1], _tool_names):
+        output['visible'][-1][1] = _last_visible_before_tool_buffer or ''
+
     yield output
 
 

From c908ac00d76d263c202a1b0cd2ed48c6f369f5e5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 09:29:17 -0700
Subject: [PATCH 088/210] Replace html2text with trafilatura for better web
 content extraction

After this change, only the main content of a page is extracted, so much
of the boilerplate is dropped, which saves tokens in agentic loops.
---
 modules/web_search.py                         | 21 ++++++++-----------
 requirements/full/requirements.txt            |  1 -
 requirements/full/requirements_amd.txt        |  1 -
 .../full/requirements_apple_intel.txt         |  1 -
 .../full/requirements_apple_silicon.txt       |  1 -
 requirements/full/requirements_cpu_only.txt   |  1 -
 requirements/full/requirements_nowheels.txt   |  1 -
 requirements/portable/requirements.txt        |  2 +-
 requirements/portable/requirements_amd.txt    |  2 +-
 .../portable/requirements_apple_intel.txt     |  2 +-
 .../portable/requirements_apple_silicon.txt   |  2 +-
 .../portable/requirements_cpu_only.txt        |  2 +-
 .../portable/requirements_cuda131.txt         |  2 +-
 .../portable/requirements_nowheels.txt        |  2 +-
 requirements/portable/requirements_vulkan.txt |  2 +-
 15 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/modules/web_search.py b/modules/web_search.py
index 216d7933..a4424ee3 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -42,9 +42,9 @@ def get_current_timestamp():
 
 def download_web_page(url, timeout=10, include_links=False):
     """
-    Download a web page and convert its HTML content to structured Markdown text.
+    Download a web page and extract its main content as Markdown text.
     """
-    import html2text
+    import trafilatura
 
     try:
         _validate_url(url)
@@ -62,16 +62,13 @@ def download_web_page(url, timeout=10, include_links=False):
 
         response.raise_for_status()
 
-        # Initialize the HTML to Markdown converter
-        h = html2text.HTML2Text()
-        h.body_width = 0
-        h.ignore_images = True
-        h.ignore_links = not include_links
-
-        # Convert the HTML to Markdown
-        markdown_text = h.handle(response.text)
-
-        return markdown_text
+        result = trafilatura.extract(
+            response.text,
+            include_links=include_links,
+            output_format='markdown',
+            url=url
+        )
+        return result or ""
     except requests.exceptions.RequestException as e:
         logger.error(f"Error downloading {url}: {e}")
         return ""
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 8a0802f7..e493d83d 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -6,7 +6,6 @@ diffusers==0.37.*
 einops
 fastapi==0.112.4
 flash-linear-attention==0.4.*
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 9b31d668..48cace33 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -4,7 +4,6 @@ datasets
 diffusers==0.37.*
 einops
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 138639e5..f9132f2e 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -4,7 +4,6 @@ datasets
 diffusers==0.37.*
 einops
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index f3ebd171..e4b2882d 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -4,7 +4,6 @@ datasets
 diffusers==0.37.*
 einops
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index e32a2ed1..1b42737b 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -4,7 +4,6 @@ datasets
 diffusers==0.37.*
 einops
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 6128c0ed..ea9ad2c7 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -4,7 +4,6 @@ datasets
 diffusers==0.37.*
 einops
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 93eb3b85..0471cc73 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -1,6 +1,5 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
@@ -11,6 +10,7 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
+trafilatura==2.0.0
 tqdm
 
 # Gradio
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 36e0e4d9..dfefce20 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -1,6 +1,5 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
@@ -11,6 +10,7 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
+trafilatura==2.0.0
 tqdm
 
 # Gradio
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 495bd5fa..5c032e6b 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -1,6 +1,5 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
@@ -11,6 +10,7 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
+trafilatura==2.0.0
 tqdm
 
 # Gradio
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 7e82f68d..385ecedf 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -1,6 +1,5 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
@@ -11,6 +10,7 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
+trafilatura==2.0.0
 tqdm
 
 # Gradio
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 046619e1..d8f7d494 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -1,6 +1,5 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
@@ -11,6 +10,7 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
+trafilatura==2.0.0
 tqdm
 
 # Gradio
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 590562f8..adc6a065 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -1,6 +1,5 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
@@ -11,6 +10,7 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
+trafilatura==2.0.0
 tqdm
 
 # Gradio
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 8c3e2aac..942f7a2a 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -1,6 +1,5 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
@@ -11,6 +10,7 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
+trafilatura==2.0.0
 tqdm
 
 # Gradio
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index bf80deb0..fca722fd 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -1,6 +1,5 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-html2text==2025.4.15
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
@@ -11,6 +10,7 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
+trafilatura==2.0.0
 tqdm
 
 # Gradio

From c7953fb92319d426f09045f4bf34f701adfeec5c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 09:44:37 -0700
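Subject: [PATCH 089/210] Add ROCm version to portable package filenames

Also add trafilatura==2.0.0 to the full requirements files; the previous
commit removed html2text from them but only added trafilatura to the
portable requirements.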
---
 .github/workflows/build-portable-release-rocm.yml | 4 ++--
 requirements/full/requirements.txt                | 1 +
 requirements/full/requirements_amd.txt            | 1 +
 requirements/full/requirements_apple_intel.txt    | 1 +
 requirements/full/requirements_apple_silicon.txt  | 1 +
 requirements/full/requirements_cpu_only.txt       | 1 +
 requirements/full/requirements_nowheels.txt       | 1 +
 7 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-portable-release-rocm.yml b/.github/workflows/build-portable-release-rocm.yml
index 6f9ea4ec..1050fa7e 100644
--- a/.github/workflows/build-portable-release-rocm.yml
+++ b/.github/workflows/build-portable-release-rocm.yml
@@ -148,11 +148,11 @@ jobs:
             # 6. Create archive
             cd ..
             if [[ "$RUNNER_OS" == "Windows" ]]; then
-                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm.zip"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm7.2.zip"
                 echo "Creating archive: $ARCHIVE_NAME"
                 powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
             else
-                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm.tar.gz"
+                ARCHIVE_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm7.2.tar.gz"
                 echo "Creating archive: $ARCHIVE_NAME"
                 tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
             fi
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index e493d83d..dca686d9 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -24,6 +24,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
+trafilatura==2.0.0
 transformers==5.3.*
 triton-windows==3.5.1.post24; platform_system == "Windows"
 tqdm
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 48cace33..37cbf729 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -24,6 +24,7 @@ tensorboard
 torchao==0.15.*
 transformers==5.3.*
 tqdm
+trafilatura==2.0.0
 wandb
 
 # Gradio
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index f9132f2e..fed46240 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -24,6 +24,7 @@ tensorboard
 torchao==0.15.*
 transformers==5.3.*
 tqdm
+trafilatura==2.0.0
 wandb
 
 # Gradio
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index e4b2882d..fac36437 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -24,6 +24,7 @@ tensorboard
 torchao==0.15.*
 transformers==5.3.*
 tqdm
+trafilatura==2.0.0
 wandb
 
 # Gradio
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 1b42737b..c86caf37 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -24,6 +24,7 @@ tensorboard
 torchao==0.15.*
 transformers==5.3.*
 tqdm
+trafilatura==2.0.0
 wandb
 
 # Gradio
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index ea9ad2c7..4f5891da 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -24,6 +24,7 @@ tensorboard
 torchao==0.15.*
 transformers==5.3.*
 tqdm
+trafilatura==2.0.0
 wandb
 
 # Gradio

From d0a4993cf483eaf0a4a4d96b5418c8da17dc23b4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 09:53:12 -0700
Subject: [PATCH 090/210] UI: Increase ctx-size slider maximum to 1M and step
 to 1024

---
 modules/ui_model_menu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index d17f586b..6ab19b7c 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -42,7 +42,7 @@ def create_ui():
                     with gr.Row():
                         with gr.Column():
                             shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=-1, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Number of layers to offload to the GPU. -1 = auto.')
-                            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=262144, step=256, value=shared.args.ctx_size, info='Context length. llama.cpp: 0 = auto if gpu-layers is also -1. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
+                            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=1048576, step=1024, value=shared.args.ctx_size, info='Context length. llama.cpp: 0 = auto if gpu-layers is also -1. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
                             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                             shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
                             shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')

From 573617157ae6cf75393ee4aae118ac7e3607bc7f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 12:09:41 -0700
Subject: [PATCH 091/210] Optimize tool call detection

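Skips tool-call parsers and streaming markers for formats whose hint
keywords are absent from the active chat/instruction template. Falls back
to all parsers and markers when the template is empty or matches no format.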
---
 extensions/openai/completions.py |   9 +-
 modules/chat.py                  |  17 ++-
 modules/tool_parsing.py          | 187 +++++++++++++++++++++----------
 3 files changed, 144 insertions(+), 69 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 51427050..fc17a19a 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -12,7 +12,7 @@ from pydantic import ValidationError
 from extensions.openai.errors import InvalidRequestError
 from extensions.openai.typing import ToolDefinition
 from extensions.openai.utils import debug_msg
-from modules.tool_parsing import get_tool_call_id, parse_tool_call
+from modules.tool_parsing import get_tool_call_id, parse_tool_call, detect_tool_call_format
 from modules import shared
 from modules.reasoning import extract_reasoning
 from modules.chat import (
@@ -484,6 +484,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     tool_calls = []
     end_last_tool_call = 0
     supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None
+    _tool_parsers = None
 
     # Filter supported_tools when tool_choice specifies a particular function
     if supported_tools and isinstance(tool_choice, dict):
@@ -491,11 +492,15 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         if specified_func and specified_func in supported_tools:
             supported_tools = [specified_func]
 
+    if supported_tools is not None:
+        _template_str = generate_params.get('instruction_template_str', '') if generate_params.get('mode') == 'instruct' else generate_params.get('chat_template_str', '')
+        _tool_parsers, _, _ = detect_tool_call_format(_template_str)
+
     for a in generator:
         answer = a['internal'][-1][1]
 
         if supported_tools is not None:
-            tool_call = parse_tool_call(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else []
+            tool_call = parse_tool_call(answer[end_last_tool_call:], supported_tools, parsers=_tool_parsers) if len(answer) > 0 else []
             if len(tool_call) > 0:
                 for tc in tool_call:
                     tc["id"] = get_tool_call_id()
diff --git a/modules/chat.py b/modules/chat.py
index 08f55539..1ffbb56b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1035,8 +1035,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     _check_tool_markers = bool(state.get('tools'))
     _last_visible_before_tool_buffer = None
     if _check_tool_markers:
-        from modules.tool_parsing import streaming_tool_buffer_check
+        from modules.tool_parsing import streaming_tool_buffer_check, detect_tool_call_format
         _tool_names = [t['function']['name'] for t in state['tools'] if 'function' in t and 'name' in t['function']]
+        _template_str = state.get('instruction_template_str', '') if state.get('mode') == 'instruct' else state.get('chat_template_str', '')
+        _, _streaming_markers, _check_bare_names = detect_tool_call_format(_template_str)
 
     # Generate
     reply = None
@@ -1088,7 +1090,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
         if is_stream:
             if _check_tool_markers:
-                if streaming_tool_buffer_check(output['internal'][-1][1], _tool_names):
+                if streaming_tool_buffer_check(output['internal'][-1][1], markers=_streaming_markers, tool_names=_tool_names, check_bare_names=_check_bare_names):
                     continue
                 _last_visible_before_tool_buffer = output['visible'][-1][1]
 
@@ -1128,7 +1130,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     # visible text from before buffering started so raw markup doesn't flash
     # in the UI.  The internal text is left intact so the caller can still
     # parse tool calls from it.
-    if is_stream and _check_tool_markers and streaming_tool_buffer_check(output['internal'][-1][1], _tool_names):
+    if is_stream and _check_tool_markers and streaming_tool_buffer_check(output['internal'][-1][1], markers=_streaming_markers, tool_names=_tool_names, check_bare_names=_check_bare_names):
         output['visible'][-1][1] = _last_visible_before_tool_buffer or ''
 
     yield output
@@ -1210,14 +1212,17 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
     # Load tools if any are selected
     selected = state.get('selected_tools', [])
     parse_tool_call = None
+    _tool_parsers = None
     if selected:
         from modules.tool_use import load_tools, execute_tool
-        from modules.tool_parsing import parse_tool_call, get_tool_call_id
+        from modules.tool_parsing import parse_tool_call, get_tool_call_id, detect_tool_call_format
 
     if selected:
         tool_defs, tool_executors = load_tools(selected)
         state['tools'] = tool_defs
         tool_func_names = [t['function']['name'] for t in tool_defs]
+        _template_str = state.get('instruction_template_str', '') if state.get('mode') == 'instruct' else state.get('chat_template_str', '')
+        _tool_parsers, _, _ = detect_tool_call_format(_template_str)
     else:
         tool_func_names = None
 
@@ -1272,7 +1277,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
                 last_save_time = current_time
 
             # Early stop on tool call detection
-            if tool_func_names and parse_tool_call(history['internal'][-1][1], tool_func_names):
+            if tool_func_names and parse_tool_call(history['internal'][-1][1], tool_func_names, parsers=_tool_parsers):
                 break
 
         # Save the model's visible output before re-applying visible_prefix,
@@ -1304,7 +1309,7 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
             break
 
         answer = history['internal'][-1][1]
-        parsed_calls, content_prefix = parse_tool_call(answer, tool_func_names, return_prefix=True) if answer else (None, '')
+        parsed_calls, content_prefix = parse_tool_call(answer, tool_func_names, return_prefix=True, parsers=_tool_parsers) if answer else (None, '')
 
         if not parsed_calls:
             break  # No tool calls — done
diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index 418503ad..0454e901 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -9,9 +9,7 @@ def get_tool_call_id() -> str:
     return "call_" + "".join(b).lower()
 
 
-# Known opening markers for tool calls across model formats.
-# Used during streaming to buffer output that might be tool call markup,
-# preventing raw markup from leaking into displayed/streamed content.
+# All known opening markers for tool calls across model formats.
 TOOL_CALL_OPENING_MARKERS = [
     '<tool_call>',
     '<function_call>',
@@ -25,36 +23,47 @@ TOOL_CALL_OPENING_MARKERS = [
     '<|channel|>commentary',
 ]
 
-def streaming_tool_buffer_check(text, tool_names=None):
+
+def streaming_tool_buffer_check(text, markers=None, tool_names=None, check_bare_names=False):
     '''
     Check whether streaming output should be withheld because it may
     contain tool-call markup.
+
+    Args:
+        text: Full accumulated internal text.
+        markers: Template-specific markers for partial-prefix matching.
+                 If None, falls back to TOOL_CALL_OPENING_MARKERS.
+        tool_names: List of tool function names.
+        check_bare_names: Whether to do partial-prefix matching on tool
+                          names (for models with unknown template format).
     '''
-    # Full marker found → buffer permanently
+    # Full marker found in text → buffer permanently.
+    # Always checks ALL known markers regardless of template (cheap safety net).
     for marker in TOOL_CALL_OPENING_MARKERS:
         if marker in text:
             return True
 
-    # Bare function-name style (e.g. Devstral): "get_weather{...}"
-    # Only match tool name followed by '{' to avoid false positives on
-    # common words that happen to be tool names (e.g. "get", "search").
+    # Bare function-name full match: "get_weather{...}" or "get_weather {...}"
     if tool_names:
         for name in tool_names:
             if name + '{' in text or name + ' {' in text:
                 return True
-            # Partial: text ends with tool name (or prefix of it) but '{' hasn't arrived yet
+
+    # Partial-prefix matching: only for template-specific markers.
+    for marker in (markers if markers is not None else TOOL_CALL_OPENING_MARKERS):
+        for prefix_len in range(min(len(marker) - 1, len(text)), 0, -1):
+            if text.endswith(marker[:prefix_len]):
+                return True
+
+    # Bare-name partial matching: only when template format is unknown.
+    if check_bare_names and tool_names:
+        for name in tool_names:
             if text.endswith(name):
                 return True
             for prefix_len in range(min(len(name) - 1, len(text)), 0, -1):
                 if text.endswith(name[:prefix_len]):
                     return True
 
-    # Tail might be a partial marker forming across tokens
-    for marker in TOOL_CALL_OPENING_MARKERS:
-        for prefix_len in range(min(len(marker) - 1, len(text)), 0, -1):
-            if text.endswith(marker[:prefix_len]):
-                return True
-
     return False
 
 
@@ -488,7 +497,102 @@ def _parse_pythonic_tool_calls(answer: str, tool_names: list[str]):
     return matches, start_pos
 
 
-def parse_tool_call(answer: str, tool_names: list[str], return_prefix: bool = False):
+# Format registry: maps template substrings to the parser and streaming
+# markers for that format.  When a format's hints are NOT found in the
+# template, its parser and markers are excluded.
+TOOL_CALL_FORMATS = [
+    {
+        'template_hints': ['tool▁call▁begin', 'tool▁calls▁begin'],
+        'parser': _parse_deep_seek_tool_calls,
+        'markers': ['<｜tool▁call▁begin｜>', '<｜tool▁calls▁begin｜>'],
+    },
+    {
+        'template_hints': ['<|tool_call_begin|>', 'tool_calls_section'],
+        'parser': _parse_kimi_tool_calls,
+        'markers': ['<|tool_call_begin|>', '<|tool_calls_section_begin|>'],
+    },
+    {
+        'template_hints': ['to=functions.', '<|channel|>'],
+        'parser': _parse_channel_tool_calls,
+        'markers': ['to=functions.', '<|channel|>commentary'],
+    },
+    {
+        'template_hints': ['minimax:tool_call'],
+        'parser': _parse_minimax_tool_calls,
+        'markers': ['<minimax:tool_call>'],
+    },
+    {
+        'template_hints': ['<arg_key>'],
+        'parser': _parse_glm_tool_calls,
+        'markers': ['<tool_call>'],
+    },
+    {
+        'template_hints': ['<tool_call>'],
+        'parser': _parse_xml_param_tool_calls,
+        'markers': ['<tool_call>'],
+    },
+    {
+        'template_hints': ['[TOOL_CALLS]'],
+        'parser': _parse_mistral_token_tool_calls,
+        'markers': ['[TOOL_CALLS]'],
+    },
+    {
+        'template_hints': ['<function_call>'],
+        'parser': None,
+        'markers': ['<function_call>'],
+    },
+]
+
+# Default ordered list of all specialized parsers.
+ALL_PARSERS = [
+    _parse_deep_seek_tool_calls,
+    _parse_kimi_tool_calls,
+    _parse_channel_tool_calls,
+    _parse_minimax_tool_calls,
+    _parse_glm_tool_calls,
+    _parse_xml_param_tool_calls,
+    _parse_mistral_token_tool_calls,
+    _parse_bare_name_tool_calls,
+    _parse_pythonic_tool_calls,
+]
+
+
+def detect_tool_call_format(template_str):
+    """Inspect a chat/instruction template to determine which tool call
+    formats are relevant.
+
+    Uses an exclude-based approach: starts with all parsers/markers,
+    then removes the ones whose hints are not found in the template.
+
+    Returns (parsers, streaming_markers, check_bare_names).
+    """
+    if not template_str:
+        return None, TOOL_CALL_OPENING_MARKERS, True
+
+    matched_any = False
+    exclude_parsers = []
+    exclude_markers = []
+    matched_markers = []
+
+    for fmt in TOOL_CALL_FORMATS:
+        if any(hint in template_str for hint in fmt['template_hints']):
+            matched_any = True
+            matched_markers.extend(fmt['markers'])
+        else:
+            if fmt['parser'] is not None:
+                exclude_parsers.append(fmt['parser'])
+            exclude_markers.extend(fmt['markers'])
+
+    if not matched_any:
+        return None, TOOL_CALL_OPENING_MARKERS, True
+
+    parsers = [p for p in ALL_PARSERS if p not in exclude_parsers]
+    markers = [m for m in TOOL_CALL_OPENING_MARKERS if m not in exclude_markers or m in matched_markers]
+
+    return parsers, markers, False
+
+
+def parse_tool_call(answer: str, tool_names: list[str], return_prefix: bool = False, parsers: list = None):
     matches = []
     start_pos = None
 
@@ -498,52 +602,13 @@ def parse_tool_call(answer: str, tool_names: list[str], return_prefix: bool = Fa
             return matches, prefix
         return matches
 
-    # Check for DeepSeek-style tool calls (fullwidth Unicode token delimiters)
-    matches, start_pos = _parse_deep_seek_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
+    # Try specialized parsers.
+    for parser in (parsers if parsers is not None else ALL_PARSERS):
+        matches, start_pos = parser(answer, tool_names)
+        if matches:
+            return _return(matches, start_pos)
 
-    # Check for Kimi-K2-style tool calls (pipe-delimited tokens)
-    matches, start_pos = _parse_kimi_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for channel-based tool calls (e.g. GPT-OSS format)
-    matches, start_pos = _parse_channel_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for MiniMax-style tool calls (invoke/parameter XML tags)
-    matches, start_pos = _parse_minimax_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for GLM-style tool calls (arg_key/arg_value XML pairs)
-    matches, start_pos = _parse_glm_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for XML-parameter style tool calls (e.g. Qwen3.5 format)
-    matches, start_pos = _parse_xml_param_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for Mistral/Devstral-style tool calls ([TOOL_CALLS]name[ARGS]json)
-    matches, start_pos = _parse_mistral_token_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for bare function-name style tool calls (e.g. Mistral format)
-    matches, start_pos = _parse_bare_name_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Check for pythonic-style tool calls (e.g. Llama 4 format)
-    matches, start_pos = _parse_pythonic_tool_calls(answer, tool_names)
-    if matches:
-        return _return(matches, start_pos)
-
-    # Define the regex pattern to find the JSON content wrapped in <function>, <tools>, <tool_call>, and other tags observed from various models
+    # Generic fallback: regex pattern to find the JSON content wrapped in <function>, <tools>, <tool_call>, and other tags observed from various models
     patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"]
 
     for pattern in patterns:

From beab346f48639b67c4b61584d2410c8d4685539e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 12:45:37 -0700
Subject: [PATCH 092/210] UI: Fix a minor glitch

---
 css/main.css | 1 -
 1 file changed, 1 deletion(-)

diff --git a/css/main.css b/css/main.css
index 49b8f752..d5e511de 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1388,7 +1388,6 @@ audio {
     overflow-wrap: break-word;
     max-height: 250px;
     overflow-y: scroll;
-    contain: layout;
 }
 
 .chat .message-body .thinking-content p,

From c09a367c6460a3fb5e44f2f5d4c24c34a7506715 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 14:01:34 -0700
Subject: [PATCH 093/210] UI: Fix dark theme using light theme syntax
 highlighting

---
 js/main.js | 6 ++++++
 server.py  | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/js/main.js b/js/main.js
index 5f79c3ec..394553d0 100644
--- a/js/main.js
+++ b/js/main.js
@@ -2,6 +2,12 @@
 // Main
 // ------------------------------------------------
 
+// Sync highlight.js theme with the actual Gradio theme
+var defined_hljs_css = document.body.classList.contains("dark") ? "file/css/highlightjs/github-dark.min.css" : "file/css/highlightjs/github.min.css";
+if (document.getElementById("highlight-css").getAttribute("href") !== defined_hljs_css) {
+  document.getElementById("highlight-css").setAttribute("href", defined_hljs_css);
+}
+
 let main_parent = document.getElementById("chat-tab").parentNode;
 let extensions = document.getElementById("extensions");
 
diff --git a/server.py b/server.py
index 340f7126..1aa9fc04 100644
--- a/server.py
+++ b/server.py
@@ -218,6 +218,10 @@ def create_interface():
 
         shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False)
 
+        # Sync theme_state with the actual client-side theme so that
+        # autosave always writes the correct dark_theme value.
+        shared.gradio['interface'].load(None, None, gradio('theme_state'), js='() => document.body.classList.contains("dark") ? "dark" : "light"')
+
         extensions_module.create_extensions_tabs()  # Extensions tabs
         extensions_module.create_extensions_block()  # Extensions block
 

From 9f657d39768da03180273dbb48e2af5424a29878 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 14:19:12 -0700
Subject: [PATCH 094/210] UI: Fix a minor glitch

---
 js/global_scope_js.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 425c2c59..084c98e8 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -287,7 +287,7 @@ function updateInstructPadding() {
     const messagesContainer = chatElement.querySelector(".messages");
     const lastChild = messagesContainer?.lastElementChild;
     const prevSibling = lastChild?.previousElementSibling;
-    if (lastChild && prevSibling) {
+    if (lastChild && prevSibling && chatElement.offsetHeight > 0) {
       let bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
       if (window.innerWidth <= 924) {
         bufferHeight = Math.max(0, bufferHeight - 32);

From 4ae2bd86e28fe67755e5329069a0d073d1162bae Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 15:30:01 -0700
Subject: [PATCH 095/210] Change the default ctx-size to 0 (auto) for llama.cpp

---
 modules/llama_cpp_server.py | 4 +++-
 modules/models.py           | 3 +++
 modules/shared.py           | 2 +-
 modules/ui_model_menu.py    | 2 +-
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 1425844d..fc8e9a19 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -371,6 +371,8 @@ class LlamaServer:
 
         if shared.args.ctx_size > 0:
             cmd += ["--ctx-size", str(shared.args.ctx_size)]
+        elif shared.args.gpu_layers >= 0:
+            cmd += ["--ctx-size", "8192"]
 
         if shared.args.gpu_layers >= 0:
             cmd += ["--gpu-layers", str(shared.args.gpu_layers), "--fit", "off"]
@@ -477,7 +479,7 @@ class LlamaServer:
             print()
 
         gpu_layers_str = "auto" if shared.args.gpu_layers < 0 else str(shared.args.gpu_layers)
-        ctx_size_str = "auto" if shared.args.ctx_size == 0 else str(shared.args.ctx_size)
+        ctx_size_str = "auto" if shared.args.ctx_size == 0 and shared.args.gpu_layers < 0 else str(shared.args.ctx_size or 8192)
         logger.info(f"Using gpu_layers={gpu_layers_str} | ctx_size={ctx_size_str} | cache_type={cache_type}")
         # Start the server with pipes for output
         self.process = subprocess.Popen(
diff --git a/modules/models.py b/modules/models.py
index d83b98d7..1d139b89 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -38,6 +38,9 @@ def load_model(model_name, loader=None):
         sampler_hijack.hijack_samplers()
 
     shared.args.loader = loader
+    if loader != 'llama.cpp' and shared.args.ctx_size == 0:
+        shared.args.ctx_size = 8192
+
     output = load_func_map[loader](model_name)
     if type(output) is tuple:
         model, tokenizer = output
diff --git a/modules/shared.py b/modules/shared.py
index 8c0aad9a..1cf365c6 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -76,7 +76,7 @@ group.add_argument('--loader', type=str, help='Choose the model loader manually,
 
 # Cache
 group = parser.add_argument_group('Context and cache')
-group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens. llama.cpp: 0 = auto if gpu-layers is also -1.')
+group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=0, metavar='N', help='Context size in tokens. 0 = auto for llama.cpp (requires gpu-layers=-1), 8192 for other loaders.')
 group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
 
 # Speculative decoding
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 6ab19b7c..b53bc292 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -42,7 +42,7 @@ def create_ui():
                     with gr.Row():
                         with gr.Column():
                             shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=-1, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Number of layers to offload to the GPU. -1 = auto.')
-                            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=1048576, step=1024, value=shared.args.ctx_size, info='Context length. llama.cpp: 0 = auto if gpu-layers is also -1. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
+                            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=1048576, step=1024, value=shared.args.ctx_size, info='Context length. 0 = auto for llama.cpp (requires gpu-layers=-1), 8192 for other loaders. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
                             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                             shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
                             shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')

From e11425d5f81adab3ab99a5920daf46b6ed042ceb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 15:46:21 -0700
Subject: [PATCH 096/210] Fix relative redirect handling in web page fetcher

---
 modules/web_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/web_search.py b/modules/web_search.py
index a4424ee3..6d005496 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -7,7 +7,7 @@ import socket
 import urllib.request
 from concurrent.futures import as_completed
 from datetime import datetime
-from urllib.parse import quote_plus, urlparse
+from urllib.parse import quote_plus, urljoin, urlparse
 
 import requests
 
@@ -55,7 +55,7 @@ def download_web_page(url, timeout=10, include_links=False):
         for _ in range(max_redirects):
             response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=False)
             if response.is_redirect and 'Location' in response.headers:
-                url = response.headers['Location']
+                url = urljoin(url, response.headers['Location'])
                 _validate_url(url)
             else:
                 break

From 9eacd4a2073fa5e2556dee8e6c511e979fd2f8c4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 16:07:16 -0700
Subject: [PATCH 097/210] UI: Minor morphdom optimizations

---
 js/global_scope_js.js | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 084c98e8..ba5abcb2 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -325,27 +325,21 @@ function applyMorphdomUpdate(data) {
 
   const queryScope = target_element;
 
-  // Track open blocks
+  // Track open blocks and store their scroll positions
   const openBlocks = new Set();
+  const scrollPositions = {};
   queryScope.querySelectorAll(".thinking-block").forEach(block => {
     const blockId = block.getAttribute("data-block-id");
-    // If block exists and is open, add to open set
     if (blockId && block.hasAttribute("open")) {
       openBlocks.add(blockId);
-    }
-  });
-
-  // Store scroll positions for any open blocks
-  const scrollPositions = {};
-  queryScope.querySelectorAll(".thinking-block[open]").forEach(block => {
-    const content = block.querySelector(".thinking-content");
-    const blockId = block.getAttribute("data-block-id");
-    if (content && blockId) {
-      const isAtBottom = Math.abs((content.scrollHeight - content.scrollTop) - content.clientHeight) < 5;
-      scrollPositions[blockId] = {
-        position: content.scrollTop,
-        isAtBottom: isAtBottom
-      };
+      const content = block.querySelector(".thinking-content");
+      if (content) {
+        const isAtBottom = Math.abs((content.scrollHeight - content.scrollTop) - content.clientHeight) < 5;
+        scrollPositions[blockId] = {
+          position: content.scrollTop,
+          isAtBottom: isAtBottom
+        };
+      }
     }
   });
 
@@ -355,8 +349,8 @@ function applyMorphdomUpdate(data) {
     {
       onBeforeElUpdated: function(fromEl, toEl) {
         // Preserve code highlighting
-        if (fromEl.tagName === "PRE" && fromEl.querySelector("code[data-highlighted]")) {
-          const fromCode = fromEl.querySelector("code");
+        if (fromEl.tagName === "PRE") {
+          const fromCode = fromEl.querySelector("code[data-highlighted]");
           const toCode = toEl.querySelector("code");
 
           if (fromCode && toCode && fromCode.textContent === toCode.textContent) {

From b9bdbd638e91e797718ffc048d187365d162e0e0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 18:18:33 -0700
Subject: [PATCH 098/210] Fix after 4ae2bd86e28fe67755e5329069a0d073d1162bae

---
 modules/models_settings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index 0e117176..25a35237 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -67,7 +67,7 @@ def get_model_metadata(model):
 
         for k in metadata:
             if k.endswith('.context_length'):
-                model_settings['ctx_size'] = min(metadata[k], 8192)
+                model_settings['ctx_size'] = 0
                 model_settings['truncation_length_info'] = metadata[k]
             elif k.endswith('rope.freq_base'):
                 model_settings['rope_freq_base'] = metadata[k]

From c12653006198ba1ba08d563160dfae81f457aa61 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 18:22:41 -0700
Subject: [PATCH 099/210] UI: Minor color change

---
 css/main.css | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/css/main.css b/css/main.css
index d5e511de..30dd28ab 100644
--- a/css/main.css
+++ b/css/main.css
@@ -439,6 +439,10 @@ audio {
     color: white !important;
 }
 
+.dark .message-body blockquote {
+    border-left-color: rgb(255 255 255 / 30%);
+}
+
 .message-body h1 {
     font-weight: 800;
     font-size: 2.25em;

From d1aba085613656c4eefaed223412cefdbe231905 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 18:35:44 -0700
Subject: [PATCH 100/210] UI: Set chat widths to 724px

---
 css/chat_style-Dark.css            | 2 +-
 css/chat_style-TheEncrypted777.css | 2 +-
 css/chat_style-cai-chat-square.css | 2 +-
 css/chat_style-cai-chat.css        | 2 +-
 css/chat_style-messenger.css       | 2 +-
 css/chat_style-wpp.css             | 2 +-
 css/html_instruct_style.css        | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css
index 02beb935..01a168ab 100644
--- a/css/chat_style-Dark.css
+++ b/css/chat_style-Dark.css
@@ -2,7 +2,7 @@
     display: grid;
     align-items: start;
     grid-template-columns: 60px minmax(0, 1fr);
-    width: min(100%, calc(768px + 60px));
+    width: min(100%, calc(724px + 60px));
     padding-bottom: 22px;
     padding-top: 6px;
     font-size: 18px;
diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css
index b3df6710..9543a3df 100644
--- a/css/chat_style-TheEncrypted777.css
+++ b/css/chat_style-TheEncrypted777.css
@@ -4,7 +4,7 @@
     display: grid;
     align-items: start;
     grid-template-columns: 60px minmax(0, 1fr);
-    width: min(100%, calc(768px + 60px + 90px));
+    width: min(100%, calc(724px + 60px + 90px));
     padding-bottom: 21px;
     padding-top: 7px;
     font-size: 18px;
diff --git a/css/chat_style-cai-chat-square.css b/css/chat_style-cai-chat-square.css
index 0d9467df..8254a4ec 100644
--- a/css/chat_style-cai-chat-square.css
+++ b/css/chat_style-cai-chat-square.css
@@ -19,5 +19,5 @@
     padding-bottom: 1.5em;
     padding-top: 0.5em;
     grid-template-columns: 70px minmax(0, 1fr);
-    width: min(100%, calc(768px + 70px));
+    width: min(100%, calc(724px + 70px));
 }
diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css
index 9cc4d4cd..66d2816d 100644
--- a/css/chat_style-cai-chat.css
+++ b/css/chat_style-cai-chat.css
@@ -2,7 +2,7 @@
     display: grid;
     align-items: start;
     grid-template-columns: 60px minmax(0, 1fr);
-    width: min(100%, calc(768px + 60px));
+    width: min(100%, calc(724px + 60px));
     padding-bottom: 1.5em;
     padding-top: 0.5em;
     font-size: 15px;
diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css
index 438b8060..fd9b5b70 100644
--- a/css/chat_style-messenger.css
+++ b/css/chat_style-messenger.css
@@ -1,5 +1,5 @@
 .message {
-    width: min(100%, calc(48rem + 60px));
+    width: min(100%, calc(724px + 60px));
     padding-bottom: 22px;
     padding-top: 3px;
     font-size: 15px;
diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css
index ad6985d2..65e253d9 100644
--- a/css/chat_style-wpp.css
+++ b/css/chat_style-wpp.css
@@ -1,6 +1,6 @@
 .message {
     display: block;
-    width: min(100%, 48rem);
+    width: min(100%, 724px);
     padding-top: 0;
     padding-bottom: 21px;
     font-size: 15px;
diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index d4780350..458feafc 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -78,7 +78,7 @@
 
 .chat .user-message .text,
 .chat .assistant-message .text {
-    max-width: 768px;
+    max-width: 724px;
     margin-left: auto;
     margin-right: auto;
 }

From 9955e54a1f22562b3e1c772e509abede61ecb92c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 18:51:12 -0700
Subject: [PATCH 101/210] UI: Fix autoscroll not engaging when regenerating
 short chats

---
 js/main.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/js/main.js b/js/main.js
index 394553d0..23db59d5 100644
--- a/js/main.js
+++ b/js/main.js
@@ -157,7 +157,7 @@ let lastClientHeight = 0;
 
 targetElement.addEventListener("scroll", function() {
   let diff = targetElement.scrollHeight - targetElement.clientHeight;
-  let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff == 0;
+  let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0;
 
   // Add scrolling class to disable hover effects
   if (window.isScrolled || !isAtBottomNow) {

From 2d3a3794c91ebabf0c07e01b2558a5ac37162c24 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 19:15:22 -0700
Subject: [PATCH 102/210] Add a Top-P preset, make it the new default, clean up
 the built-in presets

---
 modules/shared.py                          | 6 +++---
 user_data/presets/Instruct.yaml            | 1 -
 user_data/presets/Qwen3 - No Thinking.yaml | 3 ---
 user_data/presets/Qwen3 - Thinking.yaml    | 3 ---
 user_data/presets/Top-P.yaml               | 1 +
 user_data/presets/min_p.yaml               | 1 -
 6 files changed, 4 insertions(+), 11 deletions(-)
 delete mode 100644 user_data/presets/Instruct.yaml
 delete mode 100644 user_data/presets/Qwen3 - No Thinking.yaml
 delete mode 100644 user_data/presets/Qwen3 - Thinking.yaml
 create mode 100644 user_data/presets/Top-P.yaml
 delete mode 100644 user_data/presets/min_p.yaml

diff --git a/modules/shared.py b/modules/shared.py
index 1cf365c6..475d57b7 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -265,7 +265,7 @@ settings = {
     'web_search_pages': 3,
     'selected_tools': [],
     'prompt-notebook': '',
-    'preset': 'Qwen3 - Thinking' if (user_data_dir / 'presets/Qwen3 - Thinking.yaml').exists() else None,
+    'preset': 'Top-P' if (user_data_dir / 'presets/Top-P.yaml').exists() else None,
     'max_new_tokens': 512,
     'max_new_tokens_min': 1,
     'max_new_tokens_max': 4096,
@@ -290,7 +290,7 @@ settings = {
     'include_past_attachments': True,
 
     # Generation parameters - Curve shape
-    'temperature': 0.6,
+    'temperature': neutral_samplers['temperature'],
     'dynatemp_low': neutral_samplers['dynatemp_low'],
     'dynatemp_high': neutral_samplers['dynatemp_high'],
     'dynatemp_exponent': neutral_samplers['dynatemp_exponent'],
@@ -300,7 +300,7 @@ settings = {
     # Generation parameters - Curve cutoff
     'min_p': neutral_samplers['min_p'],
     'top_p': 0.95,
-    'top_k': 20,
+    'top_k': neutral_samplers['top_k'],
     'typical_p': neutral_samplers['typical_p'],
     'xtc_threshold': neutral_samplers['xtc_threshold'],
     'xtc_probability': neutral_samplers['xtc_probability'],
diff --git a/user_data/presets/Instruct.yaml b/user_data/presets/Instruct.yaml
deleted file mode 100644
index 142fcd82..00000000
--- a/user_data/presets/Instruct.yaml
+++ /dev/null
@@ -1 +0,0 @@
-min_p: 0.2
diff --git a/user_data/presets/Qwen3 - No Thinking.yaml b/user_data/presets/Qwen3 - No Thinking.yaml
deleted file mode 100644
index b1c1e03c..00000000
--- a/user_data/presets/Qwen3 - No Thinking.yaml	
+++ /dev/null
@@ -1,3 +0,0 @@
-temperature: 0.7
-top_p: 0.8
-top_k: 20
diff --git a/user_data/presets/Qwen3 - Thinking.yaml b/user_data/presets/Qwen3 - Thinking.yaml
deleted file mode 100644
index cb2942f9..00000000
--- a/user_data/presets/Qwen3 - Thinking.yaml	
+++ /dev/null
@@ -1,3 +0,0 @@
-temperature: 0.6
-top_p: 0.95
-top_k: 20
diff --git a/user_data/presets/Top-P.yaml b/user_data/presets/Top-P.yaml
new file mode 100644
index 00000000..f39e148f
--- /dev/null
+++ b/user_data/presets/Top-P.yaml
@@ -0,0 +1 @@
+top_p: 0.95
diff --git a/user_data/presets/min_p.yaml b/user_data/presets/min_p.yaml
deleted file mode 100644
index b8ebc95f..00000000
--- a/user_data/presets/min_p.yaml
+++ /dev/null
@@ -1 +0,0 @@
-min_p: 0.05

From f0c16813ef11a8ed39db3586614c06239ef25807 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 19:35:12 -0700
Subject: [PATCH 103/210] Remove the rope scaling parameters

Current models ship with context lengths of 131k+ tokens, so manual
RoPE scaling is rarely needed. The parameters can still be passed to
llama.cpp through --extra-flags.
---
 README.md                      | 26 ++++++++++----------------
 docs/04 - Model Tab.md         |  3 ---
 modules/llama_cpp_server.py    |  4 ----
 modules/loaders.py             |  7 -------
 modules/models_settings.py     | 22 ----------------------
 modules/shared.py              |  6 ------
 modules/transformers_loader.py |  7 -------
 modules/ui_model_menu.py       |  3 ---
 8 files changed, 10 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md
index 9a8e0a86..f1527176 100644
--- a/README.md
+++ b/README.md
@@ -244,15 +244,14 @@ usage: server.py [-h] [--user-data-dir USER_DATA_DIR] [--multi-user] [--model MO
                  [--row-split] [--no-mmap] [--mlock] [--no-kv-offload] [--batch-size BATCH_SIZE] [--ubatch-size UBATCH_SIZE] [--threads THREADS] [--threads-batch THREADS_BATCH] [--numa]
                  [--parallel PARALLEL] [--fit-target FIT_TARGET] [--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16]
                  [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE]
-                 [--quant_type QUANT_TYPE] [--gpu-split GPU_SPLIT] [--enable-tp] [--tp-backend TP_BACKEND] [--cfg-cache] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
-                 [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
-                 [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api]
-                 [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] [--temperature N]
-                 [--dynatemp-low N] [--dynatemp-high N] [--dynatemp-exponent N] [--smoothing-factor N] [--smoothing-curve N] [--min-p N] [--top-p N] [--top-k N] [--typical-p N] [--xtc-threshold N]
-                 [--xtc-probability N] [--epsilon-cutoff N] [--eta-cutoff N] [--tfs N] [--top-a N] [--top-n-sigma N] [--adaptive-target N] [--adaptive-decay N] [--dry-multiplier N]
-                 [--dry-allowed-length N] [--dry-base N] [--repetition-penalty N] [--frequency-penalty N] [--presence-penalty N] [--encoder-repetition-penalty N] [--no-repeat-ngram-size N]
-                 [--repetition-penalty-range N] [--penalty-alpha N] [--guidance-scale N] [--mirostat-mode N] [--mirostat-tau N] [--mirostat-eta N] [--do-sample | --no-do-sample]
-                 [--dynamic-temperature | --no-dynamic-temperature] [--temperature-last | --no-temperature-last] [--sampler-priority N] [--dry-sequence-breakers N]
+                 [--quant_type QUANT_TYPE] [--gpu-split GPU_SPLIT] [--enable-tp] [--tp-backend TP_BACKEND] [--cfg-cache] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share]
+                 [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors]
+                 [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4]
+                 [--nowebui] [--temperature N] [--dynatemp-low N] [--dynatemp-high N] [--dynatemp-exponent N] [--smoothing-factor N] [--smoothing-curve N] [--min-p N] [--top-p N] [--top-k N]
+                 [--typical-p N] [--xtc-threshold N] [--xtc-probability N] [--epsilon-cutoff N] [--eta-cutoff N] [--tfs N] [--top-a N] [--top-n-sigma N] [--adaptive-target N] [--adaptive-decay N]
+                 [--dry-multiplier N] [--dry-allowed-length N] [--dry-base N] [--repetition-penalty N] [--frequency-penalty N] [--presence-penalty N] [--encoder-repetition-penalty N]
+                 [--no-repeat-ngram-size N] [--repetition-penalty-range N] [--penalty-alpha N] [--guidance-scale N] [--mirostat-mode N] [--mirostat-tau N] [--mirostat-eta N]
+                 [--do-sample | --no-do-sample] [--dynamic-temperature | --no-dynamic-temperature] [--temperature-last | --no-temperature-last] [--sampler-priority N] [--dry-sequence-breakers N]
                  [--enable-thinking | --no-enable-thinking] [--reasoning-effort N] [--chat-template-file CHAT_TEMPLATE_FILE]
 
 Text Generation Web UI
@@ -262,7 +261,7 @@ options:
 
 Basic settings:
   --user-data-dir USER_DATA_DIR                        Path to the user data directory. Default: auto-detected.
-  --multi-user                                         Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.
+  --multi-user                                         Multi-user mode. Chat histories are not saved or automatically loaded. Best suited for small trusted teams.
   --model MODEL                                        Name of the model to load by default.
   --lora LORA [LORA ...]                               The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.
   --model-dir MODEL_DIR                                Path to directory with all the models.
@@ -289,7 +288,7 @@ Model loader:
                                                        LLM.
 
 Context and cache:
-  --ctx-size, --n_ctx, --max_seq_len N                 Context size in tokens. llama.cpp: 0 = auto if gpu-layers is also -1.
+  --ctx-size, --n_ctx, --max_seq_len N                 Context size in tokens. 0 = auto for llama.cpp (requires gpu-layers=-1), 8192 for other loaders.
   --cache-type, --cache_type N                         KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).
 
 Speculative decoding:
@@ -350,11 +349,6 @@ ExLlamaV3:
   --tp-backend TP_BACKEND                              The backend for tensor parallelism. Valid options: native, nccl. Default: native.
   --cfg-cache                                          Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.
 
-RoPE:
-  --alpha_value ALPHA_VALUE                            Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.
-  --rope_freq_base ROPE_FREQ_BASE                      If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).
-  --compress_pos_emb COMPRESS_POS_EMB                  Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale.
-
 Gradio:
   --listen                                             Make the web UI reachable from your local network.
   --listen-port LISTEN_PORT                            The listening port that the server will use.
diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md
index 4d5ae645..744970ac 100644
--- a/docs/04 - Model Tab.md	
+++ b/docs/04 - Model Tab.md	
@@ -41,9 +41,6 @@ Options:
 * **cpu_memory**: Maximum CPU memory in GiB to use for CPU offloading via the accelerate library. Whatever doesn't fit in the GPU or CPU will go to a disk cache if the "disk" checkbox is enabled.
 * **compute_dtype**: Used when "load_in_4bit" is checked. I recommend leaving the default value.
 * **quant_type**: Used when "load_in_4bit" is checked. I recommend leaving the default value.
-* **alpha_value**: Used to extend the context length of a model with a minor loss in quality. I have measured 1.75 to be optimal for 1.5x context, and 2.5 for 2x context. That is, with alpha = 2.5 you can make a model with 4096 context length go to 8192 context length.
-* **rope_freq_base**: Originally another way to write "alpha_value", it ended up becoming a necessary parameter for some models like CodeLlama, which was fine-tuned with this set to 1000000 and hence needs to be loaded with it set to 1000000 as well.
-* **compress_pos_emb**: The first and original context-length extension method, discovered by [kaiokendev](https://kaiokendev.github.io/til). When set to 2, the context length is doubled, 3 and it's tripled, etc. It should only be used for models that have been fine-tuned with this parameter set to different than 1. For models that have not been tuned to have greater context length, alpha_value will lead to a smaller accuracy loss.
 * **attn_implementation**: Choose the attention implementation. Valid options: `sdpa`, `eager`, `flash_attention_2`. The default (`sdpa`) works well in most cases; `flash_attention_2` may be useful for training.
 * **cpu**: Loads the model in CPU mode using Pytorch. The model will be loaded in 32-bit precision, so a lot of RAM will be used. CPU inference with transformers is older than llama.cpp and it works, but it's a lot slower. Note: this parameter has a different interpretation in the llama.cpp loader (see above).
 * **load_in_8bit**: Load the model in 8-bit precision using bitsandbytes. The 8-bit kernel in that library has been optimized for training and not inference, so load_in_8bit is slower than load_in_4bit (but more accurate).
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index fc8e9a19..05c07748 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -403,10 +403,6 @@ class LlamaServer:
         if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
             cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
             cache_type = shared.args.cache_type
-        if shared.args.compress_pos_emb != 1:
-            cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
-        if shared.args.rope_freq_base > 0:
-            cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
         if shared.args.mmproj not in [None, 'None']:
             path = Path(shared.args.mmproj)
             if not path.exists():
diff --git a/modules/loaders.py b/modules/loaders.py
index d2ebdbc3..c90f2ebb 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -15,8 +15,6 @@ loaders_and_params = OrderedDict({
         'tensor_split',
         'extra_flags',
         'streaming_llm',
-        'rope_freq_base',
-        'compress_pos_emb',
         'row_split',
         'no_kv_offload',
         'no_mmap',
@@ -41,8 +39,6 @@ loaders_and_params = OrderedDict({
     'Transformers': [
         'gpu_split',
         'cpu_memory',
-        'alpha_value',
-        'compress_pos_emb',
         'compute_dtype',
         'quant_type',
         'load_in_8bit',
@@ -320,9 +316,6 @@ def list_model_elements():
         'extra_flags',
         'streaming_llm',
         'gpu_split',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
         'compute_dtype',
         'quant_type',
         'load_in_8bit',
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 25a35237..f3c9a986 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -15,9 +15,6 @@ def get_fallback_settings():
     return {
         'bf16': False,
         'ctx_size': 8192,
-        'rope_freq_base': 0,
-        'compress_pos_emb': 1,
-        'alpha_value': 1,
         'truncation_length': shared.settings['truncation_length'],
         'truncation_length_info': shared.settings['truncation_length'],
         'skip_special_tokens': shared.settings['skip_special_tokens'],
@@ -69,12 +66,6 @@ def get_model_metadata(model):
             if k.endswith('.context_length'):
                 model_settings['ctx_size'] = 0
                 model_settings['truncation_length_info'] = metadata[k]
-            elif k.endswith('rope.freq_base'):
-                model_settings['rope_freq_base'] = metadata[k]
-            elif k.endswith('rope.scale_linear'):
-                model_settings['compress_pos_emb'] = metadata[k]
-            elif k.endswith('rope.scaling.factor'):
-                model_settings['compress_pos_emb'] = metadata[k]
             elif k.endswith('.block_count'):
                 model_settings['gpu_layers'] = -1
                 model_settings['max_gpu_layers'] = metadata[k] + 1
@@ -119,15 +110,6 @@ def get_model_metadata(model):
                 model_settings['ctx_size'] = min(value, 8192)
                 break
 
-            if 'rope_theta' in metadata:
-                model_settings['rope_freq_base'] = metadata['rope_theta']
-            elif 'attn_config' in metadata and 'rope_theta' in metadata['attn_config']:
-                model_settings['rope_freq_base'] = metadata['attn_config']['rope_theta']
-
-            if 'rope_scaling' in metadata and isinstance(metadata['rope_scaling'], dict) and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
-                if metadata['rope_scaling']['type'] == 'linear':
-                    model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor']
-
             if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16':
                 model_settings['bf16'] = True
 
@@ -181,10 +163,6 @@ def get_model_metadata(model):
     if 'instruction_template' not in model_settings:
         model_settings['instruction_template'] = 'Alpaca'
 
-    # Ignore rope_freq_base if set to the default value
-    if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000:
-        model_settings.pop('rope_freq_base')
-
     # Apply user settings from user_data/models/config-user.yaml
     settings = shared.user_config
     for pat in settings:
diff --git a/modules/shared.py b/modules/shared.py
index 475d57b7..354f7589 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -139,12 +139,6 @@ group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enab
 group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.')
 group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
 
-# RoPE
-group = parser.add_argument_group('RoPE')
-group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
-group.add_argument('--rope_freq_base', type=int, default=0, help='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).')
-group.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.")
-
 # Gradio
 group = parser.add_argument_group('Gradio')
 group.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index b9918764..63758ad7 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -136,8 +136,6 @@ def load_model_HF(model_name):
         shared.args.load_in_4bit,
         shared.args.disk,
         shared.args.cpu_memory is not None,
-        shared.args.compress_pos_emb > 1,
-        shared.args.alpha_value > 1,
     ])
 
     # Load the model without any special settings
@@ -200,11 +198,6 @@ def load_model_HF(model_name):
             if shared.args.disk:
                 params['offload_folder'] = str(Path(shared.args.disk_cache_dir))
 
-        if shared.args.compress_pos_emb > 1:
-            params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
-        elif shared.args.alpha_value > 1:
-            params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
-
         logger.info("TRANSFORMERS_PARAMS=")
         pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
         print()
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index b53bc292..08fdc83e 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -100,9 +100,6 @@ def create_ui():
                                 shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
                                 shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
                                 shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
-                                shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
-                                shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
-                                shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
                                 shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
                                 shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
 

From 5763cab3c4055122d85974b1bb94ce8aa526ac72 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 07:12:52 -0700
Subject: [PATCH 104/210] Fix a crash loading the MiniMax-M2.5 jinja template

---
 modules/chat.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index 1ffbb56b..bcb548fd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -12,6 +12,7 @@ from datetime import datetime
 from functools import partial
 from pathlib import Path
 
+import markupsafe
 import yaml
 from jinja2.ext import loopcontrols
 from jinja2.sandbox import ImmutableSandboxedEnvironment
@@ -79,6 +80,13 @@ jinja_env = ImmutableSandboxedEnvironment(
     lstrip_blocks=True,
     extensions=[loopcontrols]
 )
+
+
+def custom_tojson(value, indent=None, ensure_ascii=True):
+    return markupsafe.Markup(json.dumps(value, indent=indent, ensure_ascii=ensure_ascii))
+
+
+jinja_env.filters["tojson"] = custom_tojson
 jinja_env.globals["strftime_now"] = strftime_now
 
 

From 9119ce0680f3e03fb1e726b2942606ff37d01bd0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 09:22:38 -0700
Subject: [PATCH 105/210] llama.cpp: Use `--fit-ctx 8192` when `--fit on` is
 used

This sets the minimum acceptable context length, which by default is 4096.
---
 modules/llama_cpp_server.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 05c07748..c3a8d105 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -378,6 +378,7 @@ class LlamaServer:
             cmd += ["--gpu-layers", str(shared.args.gpu_layers), "--fit", "off"]
         else:
             cmd += ["--fit", "on"]
+            cmd += ["--fit-ctx", "8192"]
             if shared.args.fit_target:
                 cmd += ["--fit-target", shared.args.fit_target]
 

From 80d0c03bab226baf9058a7ae62db2a5f2e7eebca Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 09:29:25 -0700
Subject: [PATCH 106/210] llama.cpp: Change the default `--fit-target` from
 1024 to 512

---
 modules/shared.py        | 2 +-
 modules/ui_model_menu.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index 354f7589..a82b2018 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -108,7 +108,7 @@ group.add_argument('--threads', type=int, default=0, help='Number of threads to
 group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
-group.add_argument('--fit-target', type=str, default='1024', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices. Default: 1024.')
+group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
 
 # Transformers/Accelerate
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 08fdc83e..5cf0155d 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -46,7 +46,7 @@ def create_ui():
                             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                             shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
                             shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
-                            shared.gradio['fit_target'] = gr.Textbox(label='fit-target', value=shared.args.fit_target, info='Target VRAM margin per device for auto GPU layers (MiB). Comma-separated list for multiple devices. Default: 1024.')
+                            shared.gradio['fit_target'] = gr.Textbox(label='fit-target', value=shared.args.fit_target, info='Target VRAM margin per device for auto GPU layers (MiB). Comma-separated list for multiple devices.')
                             shared.gradio['tp_backend'] = gr.Dropdown(label="tp-backend", choices=['native', 'nccl'], value=shared.args.tp_backend, info='The backend for tensor parallelism.')
 
                         with gr.Column():

From bfea49b197f053428fa8eca0d9f692141f4db623 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 09:34:17 -0700
Subject: [PATCH 107/210] Move top_p and top_k higher up in the UI and CLI help

---
 modules/presets.py       | 4 ++--
 modules/shared.py        | 8 ++++----
 modules/ui_parameters.py | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/modules/presets.py b/modules/presets.py
index b53195ee..560e0b77 100644
--- a/modules/presets.py
+++ b/modules/presets.py
@@ -16,9 +16,10 @@ default_preset_values = {
     'dynatemp_exponent': 1,
     'smoothing_factor': 0,
     'smoothing_curve': 1,
-    'min_p': 0,
     'top_p': 1,
     'top_k': 0,
+    'min_p': 0,
+    'top_n_sigma': 0,
     'typical_p': 1,
     'xtc_threshold': 0.1,
     'xtc_probability': 0,
@@ -26,7 +27,6 @@ default_preset_values = {
     'eta_cutoff': 0,
     'tfs': 1,
     'top_a': 0,
-    'top_n_sigma': 0,
     'adaptive_target': 0,
     'adaptive_decay': 0.9,
     'dry_multiplier': 0,
diff --git a/modules/shared.py b/modules/shared.py
index a82b2018..329114bb 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -175,9 +175,10 @@ group.add_argument('--dynatemp-high', type=float, default=_d['dynatemp_high'], m
 group.add_argument('--dynatemp-exponent', type=float, default=_d['dynatemp_exponent'], metavar='N', help='Dynamic temperature exponent')
 group.add_argument('--smoothing-factor', type=float, default=_d['smoothing_factor'], metavar='N', help='Smoothing factor')
 group.add_argument('--smoothing-curve', type=float, default=_d['smoothing_curve'], metavar='N', help='Smoothing curve')
-group.add_argument('--min-p', type=float, default=_d['min_p'], metavar='N', help='Min P')
 group.add_argument('--top-p', type=float, default=_d['top_p'], metavar='N', help='Top P')
 group.add_argument('--top-k', type=int, default=_d['top_k'], metavar='N', help='Top K')
+group.add_argument('--min-p', type=float, default=_d['min_p'], metavar='N', help='Min P')
+group.add_argument('--top-n-sigma', type=float, default=_d['top_n_sigma'], metavar='N', help='Top N Sigma')
 group.add_argument('--typical-p', type=float, default=_d['typical_p'], metavar='N', help='Typical P')
 group.add_argument('--xtc-threshold', type=float, default=_d['xtc_threshold'], metavar='N', help='XTC threshold')
 group.add_argument('--xtc-probability', type=float, default=_d['xtc_probability'], metavar='N', help='XTC probability')
@@ -185,7 +186,6 @@ group.add_argument('--epsilon-cutoff', type=float, default=_d['epsilon_cutoff'],
 group.add_argument('--eta-cutoff', type=float, default=_d['eta_cutoff'], metavar='N', help='Eta cutoff')
 group.add_argument('--tfs', type=float, default=_d['tfs'], metavar='N', help='TFS')
 group.add_argument('--top-a', type=float, default=_d['top_a'], metavar='N', help='Top A')
-group.add_argument('--top-n-sigma', type=float, default=_d['top_n_sigma'], metavar='N', help='Top N Sigma')
 group.add_argument('--adaptive-target', type=float, default=_d['adaptive_target'], metavar='N', help='Adaptive target')
 group.add_argument('--adaptive-decay', type=float, default=_d['adaptive_decay'], metavar='N', help='Adaptive decay')
 group.add_argument('--dry-multiplier', type=float, default=_d['dry_multiplier'], metavar='N', help='DRY multiplier')
@@ -292,9 +292,10 @@ settings = {
     'smoothing_curve': neutral_samplers['smoothing_curve'],
 
     # Generation parameters - Curve cutoff
-    'min_p': neutral_samplers['min_p'],
     'top_p': 0.95,
     'top_k': neutral_samplers['top_k'],
+    'min_p': neutral_samplers['min_p'],
+    'top_n_sigma': neutral_samplers['top_n_sigma'],
     'typical_p': neutral_samplers['typical_p'],
     'xtc_threshold': neutral_samplers['xtc_threshold'],
     'xtc_probability': neutral_samplers['xtc_probability'],
@@ -302,7 +303,6 @@ settings = {
     'eta_cutoff': neutral_samplers['eta_cutoff'],
     'tfs': neutral_samplers['tfs'],
     'top_a': neutral_samplers['top_a'],
-    'top_n_sigma': neutral_samplers['top_n_sigma'],
     'adaptive_target': neutral_samplers['adaptive_target'],
     'adaptive_decay': neutral_samplers['adaptive_decay'],
 
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index a5afd7e5..5411b294 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -37,10 +37,10 @@ def create_ui():
                             shared.gradio['dynamic_temperature'] = gr.Checkbox(value=shared.settings['dynamic_temperature'], label='dynamic_temperature')
 
                             gr.Markdown('## Curve cutoff')
-                            shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=shared.settings['min_p'], step=0.01, label='min_p')
-                            shared.gradio['top_n_sigma'] = gr.Slider(0.0, 5.0, value=shared.settings['top_n_sigma'], step=0.01, label='top_n_sigma')
                             shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=shared.settings['top_p'], step=0.01, label='top_p')
                             shared.gradio['top_k'] = gr.Slider(0, 200, value=shared.settings['top_k'], step=1, label='top_k')
+                            shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=shared.settings['min_p'], step=0.01, label='min_p')
+                            shared.gradio['top_n_sigma'] = gr.Slider(0.0, 5.0, value=shared.settings['top_n_sigma'], step=0.01, label='top_n_sigma')
                             shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=shared.settings['typical_p'], step=0.01, label='typical_p')
                             shared.gradio['xtc_threshold'] = gr.Slider(0, 0.5, value=shared.settings['xtc_threshold'], step=0.01, label='xtc_threshold', info='If 2 or more tokens have probability above this threshold, consider removing all but the last one.')
                             shared.gradio['xtc_probability'] = gr.Slider(0, 1, value=shared.settings['xtc_probability'], step=0.01, label='xtc_probability', info='Probability that the removal will actually happen. 0 disables the sampler. 1 makes it always happen.')

From 1a2b84093837fefc4aeb8ddc69923ffed8548158 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 09:52:31 -0700
Subject: [PATCH 108/210] UI: Fix scroll jump when toggling thinking blocks
 during streaming

---
 js/global_scope_js.js | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index ba5abcb2..92f65622 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -420,7 +420,9 @@ function applyMorphdomUpdate(data) {
             }, 0);
           }
         }
+        autoScrollToBottom();
         updateInstructPadding();
+        autoScrollToBottom();
         // Restore scroll state so the browser's layout adjustment
         // from the toggle doesn't disable auto-scroll
         window.isScrolled = wasScrolled;

From f6a749a151a20b0d9027ab7a29797ca5d58a313c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 10:17:31 -0700
Subject: [PATCH 109/210] API: Fix /v1/models to only list the currently loaded
 model

---
 extensions/openai/models.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/extensions/openai/models.py b/extensions/openai/models.py
index 82c65093..c879a860 100644
--- a/extensions/openai/models.py
+++ b/extensions/openai/models.py
@@ -20,10 +20,14 @@ def list_models():
 
 def list_models_openai_format():
     """Returns model list in OpenAI API format"""
-    model_names = get_available_models()
+    if shared.model_name and shared.model_name != 'None':
+        data = [model_info_dict(shared.model_name)]
+    else:
+        data = []
+
     return {
         "object": "list",
-        "data": [model_info_dict(name) for name in model_names]
+        "data": data
     }
 
 

From 92d376e420cba87633cc84c16fcfb8d0c4b0db46 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 13:14:53 -0700
Subject: [PATCH 110/210] web_search: Return all results and improve URL
 extraction

---
 modules/web_search.py         | 33 +++++++++++++++++++--------------
 user_data/tools/web_search.py |  4 +---
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/modules/web_search.py b/modules/web_search.py
index 6d005496..9bebc846 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -4,10 +4,9 @@ import ipaddress
 import random
 import re
 import socket
-import urllib.request
 from concurrent.futures import as_completed
 from datetime import datetime
-from urllib.parse import quote_plus, urljoin, urlparse
+from urllib.parse import parse_qs, quote_plus, urljoin, urlparse
 
 import requests
 
@@ -87,22 +86,28 @@ def perform_web_search(query, num_pages=3, max_workers=5, timeout=10, fetch_cont
             "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
         ]
 
-        response_text = ""
-        req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
-        with urllib.request.urlopen(req, timeout=timeout) as response:
-            response_text = response.read().decode('utf-8')
+        response = requests.get(search_url, headers={'User-Agent': random.choice(agents)}, timeout=timeout)
+        response.raise_for_status()
+        response_text = response.text
 
-        # Extract results with regex
-        titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
-        urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+        # Extract results - title and URL come from the same <a class="result__a"> element
+        result_links = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+        result_tags = re.findall(r'<a([^>]*class="[^"]*result__a[^"]*"[^>]*)>', response_text, re.DOTALL)
 
         # Prepare download tasks
         download_tasks = []
-        for i in range(min(len(titles), len(urls), num_pages)):
-            url = f"https://{urls[i].strip()}"
-            title = re.sub(r'<[^>]+>', '', titles[i]).strip()
-            title = html.unescape(title)
-            download_tasks.append((url, title, i))
+        for i, (tag_attrs, raw_title) in enumerate(zip(result_tags, result_links)):
+            if num_pages is not None and i >= num_pages:
+                break
+            # Extract href and resolve the actual URL from DuckDuckGo's redirect link
+            href_match = re.search(r'href="([^"]*)"', tag_attrs)
+            if not href_match:
+                continue
+            uddg = parse_qs(urlparse(html.unescape(href_match.group(1))).query).get('uddg', [''])[0]
+            if not uddg:
+                continue
+            title = html.unescape(re.sub(r'<[^>]+>', '', raw_title).strip())
+            download_tasks.append((uddg, title, len(download_tasks)))
 
         search_results = [None] * len(download_tasks)  # Pre-allocate to maintain order
 
diff --git a/user_data/tools/web_search.py b/user_data/tools/web_search.py
index 30d13473..6c2b0f0b 100644
--- a/user_data/tools/web_search.py
+++ b/user_data/tools/web_search.py
@@ -9,7 +9,6 @@ tool = {
             "type": "object",
             "properties": {
                 "query": {"type": "string", "description": "The search query."},
-                "num_pages": {"type": "integer", "description": "Number of search results to return (default: 3)."},
             },
             "required": ["query"]
         }
@@ -19,8 +18,7 @@ tool = {
 
 def execute(arguments):
     query = arguments.get("query", "")
-    num_pages = arguments.get("num_pages", 3)
-    results = perform_web_search(query, num_pages=num_pages, fetch_content=False)
+    results = perform_web_search(query, num_pages=None, fetch_content=False)
     output = []
     for r in results:
         if r:

From f8ff7cf99e192d01f9f29bea943e452f2d60fdfa Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 14:12:59 -0700
Subject: [PATCH 111/210] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 14 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index dca686d9..06d4e3d2 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 37cbf729..368ffee1 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index fed46240..660a841b 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index fac36437..ddb0efb5 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index c86caf37..0c0c2416 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 4f5891da..2ec1e61e 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 0471cc73..2bfd2587 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index dfefce20..36a7d185 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 5c032e6b..fd68b533 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 385ecedf..4d5b9678 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index d8f7d494..34ca382b 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index adc6a065..b492acc1 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 942f7a2a..4b548dae 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index fca722fd..0770b12e 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio-4.37.2+custom.10-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.10/gradio_client-1.0.2+custom.10-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 4f80b208597e8b3526155c8b94710a84e9138fa1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 16:38:54 -0700
Subject: [PATCH 112/210] UI: Follow-up to beab346f (fix scroll deadlock on
 chat-parent)

---
 css/main.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/css/main.css b/css/main.css
index 30dd28ab..b7135077 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1680,7 +1680,7 @@ button:focus {
 .chat-parent {
     /* Optimize for scrolling performance */
     will-change: scroll-position;
-    contain: layout style paint;
+    contain: style paint;
 
     /* Ensure GPU acceleration */
     transform: translateZ(0);

From c0de1d176cb054b783290da530baa0c0d66a54d5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 17:51:42 -0700
Subject: [PATCH 113/210] UI: Add an incognito chat option

---
 css/main.css       | 43 ++++++++++++++++++++++++++++++++++++++
 js/main.js         | 32 ++++++++++++++++++++++++++++
 modules/chat.py    | 52 +++++++++++++++++++++++++++++++++++-----------
 modules/ui_chat.py |  7 ++++++-
 4 files changed, 121 insertions(+), 13 deletions(-)

diff --git a/css/main.css b/css/main.css
index b7135077..25ae15b1 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1022,6 +1022,49 @@ audio {
     padding-right: 0.5rem;
 }
 
+#new-chat-wrapper {
+    display: contents;
+}
+
+.new-chat-arrow {
+    cursor: pointer;
+    position: relative;
+    padding: 0;
+    margin-right: -15px;
+    height: 39.594px;
+    display: flex;
+    align-items: center;
+}
+
+.new-chat-menu {
+    display: none;
+    position: absolute;
+    top: 0;
+    left: 0;
+    padding-top: 1.2em;
+    z-index: var(--layer-top);
+    white-space: nowrap;
+}
+
+.new-chat-arrow:hover .new-chat-menu {
+    display: block;
+}
+
+.new-chat-menu-item {
+    cursor: pointer;
+    padding: var(--size-2);
+    background: var(--background-fill-primary);
+    box-shadow: var(--shadow-drop-lg);
+    border-radius: var(--container-radius);
+    color: var(--body-text-color);
+    font-size: var(--text-md);
+    font-weight: var(--button-large-text-weight);
+}
+
+.new-chat-menu-item:hover {
+    background: var(--background-fill-secondary);
+}
+
 #past-chats-row,
 #chat-controls {
     width: 260px;
diff --git a/js/main.js b/js/main.js
index 23db59d5..f05f93c6 100644
--- a/js/main.js
+++ b/js/main.js
@@ -552,6 +552,38 @@ document.querySelectorAll(".focus-on-chat-input").forEach(element => {
   });
 });
 
+//------------------------------------------------
+// "New chat" hover menu with incognito option
+//------------------------------------------------
+
+(function() {
+  const newChatBtn = document.getElementById("new-chat-btn");
+
+  const wrapper = document.createElement("div");
+  wrapper.id = "new-chat-wrapper";
+  newChatBtn.replaceWith(wrapper);
+  wrapper.appendChild(newChatBtn);
+
+  const arrow = document.createElement("span");
+  arrow.className = "new-chat-arrow";
+  arrow.textContent = "\u25BE";
+
+  const menu = document.createElement("div");
+  menu.className = "new-chat-menu";
+  const option = document.createElement("div");
+  option.className = "new-chat-menu-item";
+  option.textContent = "Incognito chat";
+  menu.appendChild(option);
+
+  arrow.appendChild(menu);
+  wrapper.appendChild(arrow);
+
+  option.addEventListener("click", function(e) {
+    e.stopPropagation();
+    document.querySelector("#incognito-chat-btn").click();
+  });
+})();
+
 //------------------------------------------------
 // Fix a border around the "past chats" menu
 //------------------------------------------------
diff --git a/modules/chat.py b/modules/chat.py
index bcb548fd..e4fcaabe 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1528,7 +1528,7 @@ def redraw_html(history, name1, name2, mode, style, character, reset_cache=False
     return chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=reset_cache)
 
 
-def start_new_chat(state):
+def start_new_chat(state, unique_id=None):
     mode = state['mode']
     # Initialize with empty metadata dictionary
     history = {'internal': [], 'visible': [], 'metadata': {}}
@@ -1542,7 +1542,9 @@ def start_new_chat(state):
             # Add timestamp for assistant's greeting
             update_message_metadata(history['metadata'], "assistant", 0, timestamp=get_current_timestamp())
 
-    unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
+    if unique_id is None:
+        unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
+
     save_history(history, unique_id, state['character_menu'], state['mode'])
 
     return history
@@ -1561,6 +1563,9 @@ def save_history(history, unique_id, character, mode):
     if shared.args.multi_user:
         return
 
+    if unique_id and unique_id.startswith('incognito-'):
+        return
+
     p = get_history_file_path(unique_id, character, mode)
     if not p.parent.is_dir():
         p.parent.mkdir(parents=True)
@@ -1750,6 +1755,9 @@ def save_last_chat_state(character, mode, unique_id):
     if shared.args.multi_user:
         return
 
+    if unique_id and unique_id.startswith('incognito-'):
+        return
+
     state = load_last_chat_state()
     key = get_chat_state_key(character, mode)
     state["last_chats"][key] = unique_id
@@ -2290,11 +2298,29 @@ def handle_start_new_chat_click(state):
     return [history, html, past_chats_update]
 
 
-def handle_delete_chat_confirm_click(state):
+def handle_start_incognito_chat_click(state):
     import gradio as gr
+    unique_id = 'incognito-' + datetime.now().strftime('%Y%m%d-%H-%M-%S')
+    history = start_new_chat(state, unique_id=unique_id)
+    html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
+
+    convert_to_markdown.cache_clear()
+
+    histories = find_all_histories_with_first_prompts(state)
+    past_chats_update = gr.update(choices=histories, value=unique_id)
+
+    return [history, html, past_chats_update]
+
+
+def handle_delete_chat_confirm_click(state):
     filtered_histories = find_all_histories_with_first_prompts(state)
     filtered_ids = [h[1] for h in filtered_histories]
-    index = str(filtered_ids.index(state['unique_id']))
+
+    if state['unique_id'] not in filtered_ids:
+        # Incognito or unknown chat — just load the most recent saved chat
+        index = '0'
+    else:
+        index = str(filtered_ids.index(state['unique_id']))
 
     delete_history(state['unique_id'], state['character_menu'], state['mode'])
     history, unique_id = load_history_after_deletion(state, index)
@@ -2302,13 +2328,7 @@ def handle_delete_chat_confirm_click(state):
 
     convert_to_markdown.cache_clear()
 
-    return [
-        history,
-        html,
-        unique_id,
-        gr.update(visible=False),
-        gr.update(visible=True),
-    ]
+    return [history, html, unique_id]
 
 
 def handle_branch_chat_click(state):
@@ -2324,7 +2344,8 @@ def handle_branch_chat_click(state):
         if 'metadata' in history:
             history['metadata'] = {k: v for k, v in history['metadata'].items() if int(k.split('_')[-1]) <= branch_from_index}
 
-    new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
+    prefix = 'incognito-' if state['unique_id'] and state['unique_id'].startswith('incognito-') else ''
+    new_unique_id = prefix + datetime.now().strftime('%Y%m%d-%H-%M-%S')
     save_history(history, new_unique_id, state['character_menu'], state['mode'])
 
     histories = find_all_histories_with_first_prompts(state)
@@ -2446,6 +2467,13 @@ def handle_rename_chat_click():
 
 def handle_rename_chat_confirm(rename_to, state):
     import gradio as gr
+
+    if state['unique_id'] and state['unique_id'].startswith('incognito-'):
+        return [
+            gr.update(),
+            gr.update(visible=False),
+        ]
+
     rename_history(state['unique_id'], rename_to, state['character_menu'], state['mode'])
     histories = find_all_histories_with_first_prompts(state)
 
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 0acf9c04..d2a515b8 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -28,7 +28,8 @@ def create_ui():
                     shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes=['refresh-button', 'refresh-button-medium'], elem_id='Branch', interactive=not mu)
                     shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes=['refresh-button', 'refresh-button-medium'], interactive=not mu)
                     shared.gradio['delete_chat'] = gr.Button('🗑️', visible=False, elem_classes='refresh-button', interactive=not mu, elem_id='delete_chat')
-                    shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'refresh-button-medium', 'focus-on-chat-input'])
+                    shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'refresh-button-medium', 'focus-on-chat-input'], elem_id='new-chat-btn')
+                    shared.gradio['Start incognito chat'] = gr.Button('Incognito chat', visible=False, elem_id='incognito-chat-btn')
                     shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True)
 
                 shared.gradio['search_chat'] = gr.Textbox(placeholder='Search chats...', max_lines=1, elem_id='search_chat')
@@ -290,6 +291,10 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.handle_start_new_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False)
 
+    shared.gradio['Start incognito chat'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.handle_start_incognito_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False)
+
     shared.gradio['delete_chat-confirm'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.handle_delete_chat_confirm_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False)

From b76a289e048b9c217d54397705c0e0f5faf15dbe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 22:03:29 -0300
Subject: [PATCH 114/210] API: Respect --listen-host for the OpenAI API server

Closes #7429
---
 extensions/openai/script.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/extensions/openai/script.py b/extensions/openai/script.py
index f161e1e4..a0d5deb8 100644
--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@@ -458,10 +458,13 @@ def run_server():
 
     # In the server configuration:
     server_addrs = []
-    if os.environ.get('OPENEDAI_ENABLE_IPV6', shared.args.api_enable_ipv6):
-        server_addrs.append('[::]' if shared.args.listen else '[::1]')
-    if not os.environ.get('OPENEDAI_DISABLE_IPV4', shared.args.api_disable_ipv4):
-        server_addrs.append('0.0.0.0' if shared.args.listen else '127.0.0.1')
+    if shared.args.listen and shared.args.listen_host:
+        server_addrs.append(shared.args.listen_host)
+    else:
+        if os.environ.get('OPENEDAI_ENABLE_IPV6', shared.args.api_enable_ipv6):
+            server_addrs.append('[::]' if shared.args.listen else '[::1]')
+        if not os.environ.get('OPENEDAI_DISABLE_IPV4', shared.args.api_disable_ipv4):
+            server_addrs.append('0.0.0.0' if shared.args.listen else '127.0.0.1')
 
     if not server_addrs:
         raise Exception('you MUST enable IPv6 or IPv4 for the API to work')

From 5cfe9fe2954ad45fe8d485d20baeb29546e040e7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 20:05:56 -0700
Subject: [PATCH 115/210] Update README

---
 README.md | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index f1527176..c3450bf8 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 
 # Text Generation Web UI
 
-Run large language models locally with full privacy. Supports text generation, vision, image generation, training, tool-calling, and more — across multiple backends including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). 100% offline, zero telemetry.
+A Gradio web UI for running Large Language Models locally. 100% private and offline. Supports text generation, vision, tool-calling, training, image generation, and more.
 
 [Try the Deep Reason extension](https://oobabooga.gumroad.com/l/deep_reason)
 
@@ -24,12 +24,12 @@ Run large language models locally with full privacy. Supports text generation, v
 ## Features
 
 - **Multiple backends**: [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Switch between backends and models without restarting.
-- **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
-- **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).
-- **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
-- **OpenAI-compatible API**: Chat and Completions endpoints with tool-calling support — use as a local drop-in replacement for the OpenAI API ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
-- **Web search**: Search the internet with LLM-generated queries to add context to conversations.
 - **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
+- **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
+- **Web search**: Search the internet with LLM-generated queries to add context to conversations.
+- **OpenAI-compatible API**: Chat and Completions endpoints with tool-calling support — use as a local drop-in replacement for the OpenAI API ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
+- **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
+- **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).
 - **Easy setup**: [Portable builds](https://github.com/oobabooga/text-generation-webui/releases) (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or a one-click installer for the full feature set.
 - 100% offline and private, with zero telemetry, external resources, or remote update requests.
 - `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters. Prompts are automatically formatted with Jinja2 templates.
@@ -46,10 +46,11 @@ Run large language models locally with full privacy. Supports text generation, v
 
 No installation needed – just download, unzip and run. All dependencies included.
 
-Compatible with GGUF (llama.cpp) models on Windows, Linux, and macOS. [Check what models fit your hardware](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator).
-
 Download from here: **https://github.com/oobabooga/text-generation-webui/releases**
 
+- Builds are provided for Linux, Windows, and macOS, with options for CUDA, Vulkan, ROCm, and CPU-only.
+- Compatible with GGUF (llama.cpp) models.
+
 #### Option 2: Manual portable install with venv
 
 Very fast setup that should work on any Python 3.9+:
@@ -423,7 +424,10 @@ API generation defaults:
 
 ## Downloading models
 
-Download a GGUF model file from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads&search=gguf) and place it in the `user_data/models` folder. That's it — the UI will detect it automatically.
+1. Download a GGUF model file from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads&search=gguf).
+2. Place it in the `user_data/models` folder.
+
+That's it. The UI will detect it automatically.
 
 Not sure what will fit your GPU? Use the [VRAM Calculator](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator).
 

From 9d9f5d98604da58e79911a98b6a714f968b27ac9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 15 Mar 2026 20:27:44 -0700
Subject: [PATCH 116/210] Update README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c3450bf8..b75e2c11 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ A Gradio web UI for running Large Language Models locally. 100% private and offl
 - **Multiple backends**: [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Switch between backends and models without restarting.
 - **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
 - **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
-- **Web search**: Search the internet with LLM-generated queries to add context to conversations.
+- **Tool-calling**: Models can call custom functions during chat — web search, page fetching, math, and more. Each tool is a single `.py` file, easy to create and extend ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Tool-Calling-Tutorial)).
 - **OpenAI-compatible API**: Chat and Completions endpoints with tool-calling support — use as a local drop-in replacement for the OpenAI API ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
 - **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
 - **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).

From 50685c93f2e273fd8dd46b1f790736e62f61f0d8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 05:29:27 -0700
Subject: [PATCH 117/210] Update README

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index b75e2c11..989659d1 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ A Gradio web UI for running Large Language Models locally. 100% private and offl
 - **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
 - **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
 - **Tool-calling**: Models can call custom functions during chat — web search, page fetching, math, and more. Each tool is a single `.py` file, easy to create and extend ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Tool-Calling-Tutorial)).
-- **OpenAI-compatible API**: Chat and Completions endpoints with tool-calling support — use as a local drop-in replacement for the OpenAI API ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
+- **OpenAI-compatible API**: Chat and Completions endpoints with tool-calling support. Use as a local drop-in replacement for the OpenAI API ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
 - **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
 - **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).
 - **Easy setup**: [Portable builds](https://github.com/oobabooga/text-generation-webui/releases) (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or a one-click installer for the full feature set.
@@ -81,7 +81,7 @@ deactivate
 
 #### Option 3: One-click installer
 
-For users who need additional backends (ExLlamaV3, Transformers) or extensions (TTS, voice input, translation, etc). Requires ~10GB disk space and downloads PyTorch.
+For users who need additional backends (ExLlamaV3, Transformers), training, image generation, or extensions (TTS, voice input, translation, etc). Requires ~10GB disk space and downloads PyTorch.
 
 1. Clone the repository, or [download its source code](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip) and extract it.
 2. Run the startup script for your OS: `start_windows.bat`, `start_linux.sh`, or `start_macos.sh`.
@@ -429,7 +429,7 @@ API generation defaults:
 
 That's it. The UI will detect it automatically.
 
-Not sure what will fit your GPU? Use the [VRAM Calculator](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator).
+To check what will fit your GPU, you can use the [VRAM Calculator](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator).
 
 <details>
 <summary>Other model types (Transformers, EXL3)</summary>

From 737ded695913de99db88d90384bde552d297dde9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 05:37:46 -0700
Subject: [PATCH 118/210] Web search: Fix SSRF validation to block all
 non-global IPs

---
 modules/web_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/web_search.py b/modules/web_search.py
index 9bebc846..e13ef62a 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -28,8 +28,8 @@ def _validate_url(url):
     try:
         for family, _, _, _, sockaddr in socket.getaddrinfo(hostname, None):
             ip = ipaddress.ip_address(sockaddr[0])
-            if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
-                raise ValueError(f"Access to private/internal address {ip} is blocked")
+            if not ip.is_global:
+                raise ValueError(f"Access to non-public address {ip} is blocked")
     except socket.gaierror:
         raise ValueError(f"Could not resolve hostname: {hostname}")
 

From 6c05a964a75af8d6e75c053ee8642eb787151029 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 06:00:16 -0700
Subject: [PATCH 119/210] docs: Mention supported tool-calling models

---
 docs/Tool Calling Tutorial.md | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/docs/Tool Calling Tutorial.md b/docs/Tool Calling Tutorial.md
index 170bdff7..801e9d78 100644
--- a/docs/Tool Calling Tutorial.md	
+++ b/docs/Tool Calling Tutorial.md	
@@ -1,8 +1,23 @@
+## Supported models
+
+The following models are supported:
+
+- Qwen 3.5
+- GPT-OSS
+- Mistral Small / Devstral
+- DeepSeek V3
+- Kimi-K2
+- MiniMax-M2.5
+- GLM-5
+- Llama 4
+
+Other models that output tool calls as JSON (inside XML tags, code blocks, or plain JSON) are also supported through a generic fallback parser.
+
 ## Tool calling in the UI
 
 ### 1. Load a model with tool-calling support
 
-Load a model with tool-calling support (Qwen, Mistral, Llama 4, etc.) from the Model tab.
+Load a model with tool-calling support from the Model tab.
 
 ### 2. Select tools
 

From 44810751de7badd8d60677cc9ae3b1469cfdb0ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 06:21:14 -0700
Subject: [PATCH 120/210] Update llama.cpp

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 12 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 06d4e3d2..c24f4a9d 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 368ffee1..7c481224 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 660a841b..b1c8f78e 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index ddb0efb5..63ef33ea 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 0c0c2416..4bc61622 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 2bfd2587..ba4c7a04 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 36a7d185..5dfdd9c8 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index fd68b533..f62241b3 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 4d5b9678..353d9172 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 34ca382b..5f039318 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index b492acc1..d8b03102 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 0770b12e..fd2511f4 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.93.0/llama_cpp_binaries-0.93.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 1c89376370b63ad32fef472114ec036edaaf8d1c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 15:23:24 -0700
Subject: [PATCH 121/210] training: Add gradient_checkpointing for lower VRAM
 by default

---
 docs/05 - Training Tab.md | 2 ++
 modules/training.py       | 8 +++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/05 - Training Tab.md b/docs/05 - Training Tab.md
index 0bfc59aa..46424eab 100644
--- a/docs/05 - Training Tab.md	
+++ b/docs/05 - Training Tab.md	
@@ -100,6 +100,8 @@ Each parameter has a description in the UI. Below is guidance on the most import
 
 VRAM usage during training is roughly similar to inference with ~1000 tokens of context. If you can run the model, you can probably train LoRAs with the default settings. If you run out of VRAM, reduce `Micro Batch Size` or `Cutoff Length`. Training 4-bit quantized models uses more VRAM — set `Micro Batch Size` to `1` to compensate.
 
+**Gradient checkpointing** is enabled by default. It reduces VRAM usage by recomputing activations during the backward pass instead of storing them in memory. The tradeoff is ~20-30% slower training. There is no impact on accuracy — the results are mathematically identical. The savings are most noticeable with longer sequences and larger batch sizes. You can disable it if you have VRAM to spare and want faster training.
+
 ### Rank
 
 Higher rank = more learning capacity = larger adapter = more VRAM. Use 4–8 for style/format, 128–256 to teach factual knowledge.
diff --git a/modules/training.py b/modules/training.py
index 878bb222..6549b35e 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -26,7 +26,7 @@ from modules.evaluate import (
 from modules.logging_colors import logger
 from modules.models import reload_model
 
-PARAMETERS = ["lora_name", "always_override", "all_linear", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "text_dataset", "higher_rank_limit", "warmup_steps", "optimizer", "stride_length", "stop_at_loss", "add_eos_token", "excess_length", "report_to"]
+PARAMETERS = ["lora_name", "always_override", "all_linear", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "text_dataset", "higher_rank_limit", "warmup_steps", "optimizer", "stride_length", "stop_at_loss", "add_eos_token", "excess_length", "report_to", "gradient_checkpointing"]
 WANT_INTERRUPT = False
 
 train_log = {}
@@ -101,6 +101,7 @@ def create_ui():
                                 add_eos_token = gr.Checkbox(label='Add EOS token', value=True, info="Adds EOS token for each document in text datasets.")
                                 excess_length = gr.Dropdown(label='Excess length', value='drop', choices=['drop', 'truncate'], info='What to do with conversations that exceed the cutoff length. "Drop" removes them entirely (recommended). "Truncate" cuts from the right, which may produce incomplete responses.', elem_classes=['slim-dropdown'])
 
+                                gradient_checkpointing = gr.Checkbox(label='Gradient checkpointing', value=True, info='Trades ~20-30% training speed for reduced VRAM usage by recomputing activations during the backward pass instead of storing them. No impact on accuracy.')
                                 higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.')
                                 report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True)
 
@@ -159,7 +160,7 @@ def create_ui():
                 refresh_table = gr.Button('Refresh the table', elem_classes="small-button", interactive=not mu)
 
     # Training events
-    all_params = [lora_name, always_override, all_linear, q_proj_en, v_proj_en, k_proj_en, o_proj_en, gate_proj_en, down_proj_en, up_proj_en, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, text_dataset, higher_rank_limit, warmup_steps, optimizer, stride_length, stop_at_loss, add_eos_token, excess_length, report_to]
+    all_params = [lora_name, always_override, all_linear, q_proj_en, v_proj_en, k_proj_en, o_proj_en, gate_proj_en, down_proj_en, up_proj_en, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, text_dataset, higher_rank_limit, warmup_steps, optimizer, stride_length, stop_at_loss, add_eos_token, excess_length, report_to, gradient_checkpointing]
 
     copy_from.change(do_copy_params, [copy_from] + all_params, all_params)
     start_button.click(do_train, all_params, output)
@@ -293,7 +294,7 @@ def calc_trainable_parameters(model):
     return trainable_params, all_param
 
 
-def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, text_dataset: str, higher_rank_limit: bool, warmup_steps: int, optimizer: str, stride_length: int, stop_at_loss: float, add_eos_token: bool, excess_length: str, report_to: str):
+def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, text_dataset: str, higher_rank_limit: bool, warmup_steps: int, optimizer: str, stride_length: int, stop_at_loss: float, add_eos_token: bool, excess_length: str, report_to: str, gradient_checkpointing: bool = True):
 
     import torch
     import transformers
@@ -708,6 +709,7 @@ def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en:
             load_best_model_at_end=eval_data is not None,
             # TODO: Enable multi-device support
             ddp_find_unused_parameters=None,
+            gradient_checkpointing=gradient_checkpointing,
             use_cpu=shared.args.cpu,
             remove_unused_columns=False,
         ),

From 22ff5044b0ccd12e2ab1181e4dd3d503a7b0ae2c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 16:01:28 -0700
Subject: [PATCH 122/210] training: Organize the UI

---
 modules/training.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 6549b35e..7cb50068 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -90,19 +90,16 @@ def create_ui():
                     with gr.Accordion(label='Advanced Options', open=False, elem_classes='tgw-accordion'):
                         with gr.Row():
                             with gr.Column():
+                                optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Optimizer algorithm. adamw_torch is the standard choice. adamw_bnb_8bit uses less VRAM. adafactor is memory-efficient for large models.', elem_classes=['slim-dropdown'])
+                                warmup_steps = gr.Number(label='Warmup Steps', value=100, info='For this many steps at the start, the learning rate is gradually ramped up from 0 to the target value. This prevents unstable updates early in training.')
                                 lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.0, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.')
                                 stop_at_loss = gr.Slider(label='Stop at loss', minimum=0.0, maximum=3.0, step=0.1, value=0.00, info='The process will automatically stop once the desired loss value is reached. (reasonable numbers are 1.5-1.8)')
-                                with gr.Row():
-                                    optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Optimizer algorithm. adamw_torch is the standard choice. adamw_bnb_8bit uses less VRAM. adafactor is memory-efficient for large models.', elem_classes=['slim-dropdown'])
 
                             with gr.Column():
-                                warmup_steps = gr.Number(label='Warmup Steps', value=100, info='For this many steps at the start, the learning rate is gradually ramped up from 0 to the target value. This prevents unstable updates early in training.')
-
-                                add_eos_token = gr.Checkbox(label='Add EOS token', value=True, info="Adds EOS token for each document in text datasets.")
-                                excess_length = gr.Dropdown(label='Excess length', value='drop', choices=['drop', 'truncate'], info='What to do with conversations that exceed the cutoff length. "Drop" removes them entirely (recommended). "Truncate" cuts from the right, which may produce incomplete responses.', elem_classes=['slim-dropdown'])
-
                                 gradient_checkpointing = gr.Checkbox(label='Gradient checkpointing', value=True, info='Trades ~20-30% training speed for reduced VRAM usage by recomputing activations during the backward pass instead of storing them. No impact on accuracy.')
+                                add_eos_token = gr.Checkbox(label='Add EOS token', value=True, info="Adds EOS token for each document in text datasets.")
                                 higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.')
+                                excess_length = gr.Dropdown(label='Excess length', value='drop', choices=['drop', 'truncate'], info='What to do with conversations that exceed the cutoff length. "Drop" removes them entirely (recommended). "Truncate" cuts from the right, which may produce incomplete responses.', elem_classes=['slim-dropdown'])
                                 report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True)
 
                 with gr.Column():

From 238cbd5656a9007d7e4f5ff39a04e1d340b9e50c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 16:05:43 -0700
Subject: [PATCH 123/210] training: Remove arbitrary higher_rank_limit
 parameter

---
 modules/training.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 7cb50068..db7b206b 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -26,7 +26,7 @@ from modules.evaluate import (
 from modules.logging_colors import logger
 from modules.models import reload_model
 
-PARAMETERS = ["lora_name", "always_override", "all_linear", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "text_dataset", "higher_rank_limit", "warmup_steps", "optimizer", "stride_length", "stop_at_loss", "add_eos_token", "excess_length", "report_to", "gradient_checkpointing"]
+PARAMETERS = ["lora_name", "always_override", "all_linear", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "text_dataset", "warmup_steps", "optimizer", "stride_length", "stop_at_loss", "add_eos_token", "excess_length", "report_to", "gradient_checkpointing"]
 WANT_INTERRUPT = False
 
 train_log = {}
@@ -73,8 +73,8 @@ def create_ui():
 
                     with gr.Row():
                         with gr.Column():
-                            lora_rank = gr.Slider(label='LoRA Rank', value=8, minimum=0, maximum=1024, step=4, info='Also called dimension count. Higher values = larger file, more content control. Smaller values = smaller file, less control. Use 4 or 8 for style, 128 or 256 to teach, 1024+ for fine-detail on big data. More VRAM is needed for higher ranks.')
-                            lora_alpha = gr.Slider(label='LoRA Alpha', value=16, minimum=0, maximum=2048, step=4, info='This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
+                            lora_rank = gr.Slider(label='LoRA Rank', value=8, minimum=0, maximum=2048, step=4, info='Also called dimension count. Use 4–8 for style/format, 128–256 to teach factual knowledge, 1024+ for comprehensive fine-tuning. Very high ranks require significant VRAM.')
+                            lora_alpha = gr.Slider(label='LoRA Alpha', value=16, minimum=0, maximum=4096, step=4, info='This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
                             batch_size = gr.Slider(label='Batch Size', value=32, minimum=0, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
                             micro_batch_size = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
                             cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=4096, value=512, step=32, info='Maximum sequence length in tokens. For instruction datasets, conversations longer than this are dropped. For text datasets, documents are split into chunks of this size. Higher values require more VRAM.')
@@ -98,7 +98,6 @@ def create_ui():
                             with gr.Column():
                                 gradient_checkpointing = gr.Checkbox(label='Gradient checkpointing', value=True, info='Trades ~20-30% training speed for reduced VRAM usage by recomputing activations during the backward pass instead of storing them. No impact on accuracy.')
                                 add_eos_token = gr.Checkbox(label='Add EOS token', value=True, info="Adds EOS token for each document in text datasets.")
-                                higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.')
                                 excess_length = gr.Dropdown(label='Excess length', value='drop', choices=['drop', 'truncate'], info='What to do with conversations that exceed the cutoff length. "Drop" removes them entirely (recommended). "Truncate" cuts from the right, which may produce incomplete responses.', elem_classes=['slim-dropdown'])
                                 report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True)
 
@@ -157,12 +156,12 @@ def create_ui():
                 refresh_table = gr.Button('Refresh the table', elem_classes="small-button", interactive=not mu)
 
     # Training events
-    all_params = [lora_name, always_override, all_linear, q_proj_en, v_proj_en, k_proj_en, o_proj_en, gate_proj_en, down_proj_en, up_proj_en, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, text_dataset, higher_rank_limit, warmup_steps, optimizer, stride_length, stop_at_loss, add_eos_token, excess_length, report_to, gradient_checkpointing]
+    all_params = [lora_name, always_override, all_linear, q_proj_en, v_proj_en, k_proj_en, o_proj_en, gate_proj_en, down_proj_en, up_proj_en, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, text_dataset, warmup_steps, optimizer, stride_length, stop_at_loss, add_eos_token, excess_length, report_to, gradient_checkpointing]
 
     copy_from.change(do_copy_params, [copy_from] + all_params, all_params)
     start_button.click(do_train, all_params, output)
     stop_button.click(do_interrupt, None, None, queue=False)
-    higher_rank_limit.change(change_rank_limit, [higher_rank_limit], [lora_rank, lora_alpha])
+
 
     # Evaluation events. For some reason, the interrupt event
     # doesn't work with the .then() syntax, so I write them one
@@ -207,10 +206,6 @@ def do_copy_params(lora_name: str, *args):
     return result
 
 
-def change_rank_limit(use_higher_ranks: bool):
-    mult = 2 if use_higher_ranks else 1
-    return {"maximum": 1024 * mult, "__type__": "update"}, {"maximum": 2048 * mult, "__type__": "update"}
-
 
 def clean_path(base_path: str, path: str):
     """Strips unusual symbols and forcibly builds a path as relative to the intended directory."""
@@ -291,7 +286,7 @@ def calc_trainable_parameters(model):
     return trainable_params, all_param
 
 
-def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, text_dataset: str, higher_rank_limit: bool, warmup_steps: int, optimizer: str, stride_length: int, stop_at_loss: float, add_eos_token: bool, excess_length: str, report_to: str, gradient_checkpointing: bool = True):
+def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, text_dataset: str, warmup_steps: int, optimizer: str, stride_length: int, stop_at_loss: float, add_eos_token: bool, excess_length: str, report_to: str, gradient_checkpointing: bool = True):
 
     import torch
     import transformers

From 9d02d3a13b2e39a2a3bf91d8936044f9bbd9fd49 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 16:08:06 -0700
Subject: [PATCH 124/210] docs: Minor change to tool calling tutorial

---
 docs/Tool Calling Tutorial.md | 40 +++++++++++++++++------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/docs/Tool Calling Tutorial.md b/docs/Tool Calling Tutorial.md
index 801e9d78..d95a9c80 100644
--- a/docs/Tool Calling Tutorial.md	
+++ b/docs/Tool Calling Tutorial.md	
@@ -1,18 +1,3 @@
-## Supported models
-
-The following models are supported:
-
-- Qwen 3.5
-- GPT-OSS
-- Mistral Small / Devstral
-- DeepSeek V3
-- Kimi-K2
-- MiniMax-M2.5
-- GLM-5
-- Llama 4
-
-Other models that output tool calls as JSON (inside XML tags, code blocks, or plain JSON) are also supported through a generic fallback parser.
-
 ## Tool calling in the UI
 
 ### 1. Load a model with tool-calling support
@@ -23,11 +8,11 @@ Load a model with tool-calling support from the Model tab.
 
 In the chat sidebar, check the tools you want the model to use:
 
-- **web_search** -- Search the web using DuckDuckGo.
-- **fetch_webpage** -- Fetch the content of a URL.
-- **calculate** -- Evaluate math expressions.
-- **get_datetime** -- Get the current date and time.
-- **roll_dice** -- Roll dice.
+- `web_search`: Search the web using DuckDuckGo.
+- `fetch_webpage`: Fetch the content of a URL.
+- `calculate`: Evaluate math expressions.
+- `get_datetime`: Get the current date and time.
+- `roll_dice`: Roll dice.
 
 ### 3. Chat
 
@@ -157,3 +142,18 @@ for _ in range(10):
         print(f"\nAssistant: {choice['message']['content']}")
         break
 ```
+
+## Supported models
+
+The following models are supported:
+
+- Qwen 3.5
+- GPT-OSS
+- Mistral Small / Devstral
+- DeepSeek V3
+- Kimi-K2
+- MiniMax-M2.5
+- GLM-5
+- Llama 4
+
+Other models that output tool calls as JSON (inside XML tags, code blocks, or plain JSON) are also supported through a generic fallback parser.

From dff8903b03c2b3e46e11c16862f12db0495b3e91 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 18:25:54 -0700
Subject: [PATCH 125/210] UI: Modernize the Gradio theme

---
 css/main.css          | 73 +++++++++++++++++++++++++++++--------------
 modules/ui.py         | 26 +++++++++------
 modules/ui_session.py |  2 +-
 3 files changed, 68 insertions(+), 33 deletions(-)

diff --git a/css/main.css b/css/main.css
index 25ae15b1..22fac5c5 100644
--- a/css/main.css
+++ b/css/main.css
@@ -2,8 +2,8 @@
     --darker-gray: #1C1C1D;
     --dark-gray: #212125;
     --light-gray: #2C2E34;
-    --light-theme-gray: #f9fbff;
-    --border-color-dark: #525252;
+    --light-theme-gray: #f0f3fb;
+    --border-color-dark: rgba(255, 255, 255, 0.15);
     --header-width: 112px;
     --selected-item-color-dark: #282930;
 }
@@ -127,7 +127,7 @@ gradio-app > :first-child {
 }
 
 .header_bar {
-    border-right: var(--input-border-width) solid var(--input-border-color);
+    border-right: none;
     margin-bottom: 0;
     overflow-x: scroll;
     text-wrap: nowrap;
@@ -150,7 +150,7 @@ gradio-app > :first-child {
 
 .dark .header_bar {
     border: none !important;
-    box-shadow: 0 3px 4px rgba(20 20 20 / 60%);
+    box-shadow: none;
     background-color: #8080802b;
 }
 
@@ -268,17 +268,17 @@ button {
 .dark #image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb,
 .dark #image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb:hover {
     background: rgb(255 255 255 / 6.25%);
-    border-radius: 10px;
+    border-radius: 30px;
 }
 
 .pretty_scrollbar::-webkit-resizer,
 #image-history-gallery > :nth-child(2)::-webkit-resizer {
-    background: #c5c5d2;
+    background: #d2d2d8;
 }
 
 .dark .pretty_scrollbar::-webkit-resizer,
 .dark #image-history-gallery > :nth-child(2)::-webkit-resizer {
-    background: #ccc;
+    background: rgb(255 255 255 / 10%);
     border-radius: 10px;
 }
 
@@ -582,10 +582,28 @@ audio {
 
 #chat-input textarea {
     background: #f3f4f6;
-    padding: 0.65rem 2.5rem;
-    border: 0;
-    box-shadow: 0;
-    border-radius: 8px;
+    padding: 0.65rem 2.5rem 0.6rem;
+    margin-top: 0.15rem;
+    border: 1px solid #d2d2d8;
+    border-radius: 1.5rem;
+    overflow-y: auto !important;
+}
+
+#chat-input textarea::-webkit-scrollbar {
+    width: 8px;
+}
+
+#chat-input textarea::-webkit-scrollbar-track {
+    background: transparent;
+}
+
+#chat-input textarea::-webkit-scrollbar-thumb {
+    background: var(--neutral-300);
+    border-radius: 30px;
+}
+
+.dark #chat-input textarea::-webkit-scrollbar-thumb {
+    background: rgb(255 255 255 / 6.25%);
 }
 
 #chat-input textarea::placeholder {
@@ -725,10 +743,12 @@ audio {
     position: absolute;
     bottom: 100%;
     left: 0;
-    box-shadow: 0 0 5px rgb(0 0 0 / 25%);
+    box-shadow: 0 2px 12px rgb(0 0 0 / 15%);
+    border-radius: 0.5rem;
     z-index: 10000;
     min-width: 330px;
     flex-direction: column;
+    overflow: hidden;
 }
 
 .hover-menu button {
@@ -739,6 +759,7 @@ audio {
     margin: 0 !important;
     height: 36px;
     border-color: transparent !important;
+    transition: background-color 0.15s ease;
 }
 
 .hover-menu button:not(#clear-history-confirm) {
@@ -914,7 +935,7 @@ audio {
 .options {
     z-index: 100 !important;
     border: 1px solid var(--input-border-color);
-    border-radius: 0;
+    border-radius: 0.5rem;
 }
 
 /* ----------------------------------------------
@@ -1008,9 +1029,13 @@ audio {
     cursor: pointer;
 }
 
+#past-chats label {
+    transition: background-color 0.15s ease;
+}
+
 #past-chats .selected,
 #past-chats label:hover {
-    background-color: #dbeafe !important;
+    background-color: #c8d8f5 !important;
 }
 
 #past-chats-buttons,
@@ -1166,7 +1191,7 @@ audio {
   Dark theme
 ---------------------------------------------- */
 .dark .header_bar {
-    background-color: var(--darker-gray) !important;
+    background-color: #1a1a1a !important;
 }
 
 .dark .header_bar button.selected {
@@ -1176,7 +1201,7 @@ audio {
 .dark #chat-input textarea {
     background: var(--light-gray);
     color: white !important;
-    border-color: #292c3b;
+    border-color: rgba(255, 255, 255, 0.06);
 }
 
 .dark #chat-input textarea::placeholder {
@@ -1192,6 +1217,7 @@ audio {
 .dark #past-chats-row {
     background-color: var(--darker-gray);
     border: 0 !important;
+    box-shadow: none;
 }
 
 .dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected,
@@ -1228,11 +1254,11 @@ audio {
   Light theme
 ---------------------------------------------- */
 .header_bar {
-    background-color: var(--light-theme-gray) !important;
+    background-color: #e4e8f0 !important;
 }
 
 .header_bar button.selected {
-    background: #dbeafe;
+    background: #c8d8f5;
 }
 
 #chat-controls,
@@ -1241,11 +1267,11 @@ audio {
 }
 
 .dark #chat-controls {
-    border-left: 1px solid #d9d9d0;
+    border-left: 1px solid rgba(255, 255, 255, 0.06);
 }
 
 .dark #past-chats-row {
-    border-right: 1px solid #d9d9d0;
+    border-right: 1px solid rgba(255, 255, 255, 0.06);
 }
 
 #past-chats-toggle,
@@ -1364,6 +1390,7 @@ audio {
 
 .tgw-accordion {
     padding: 10px 12px !important;
+    border: 1px solid #d2d2d8;
 }
 
 .dark .tgw-accordion {
@@ -1393,7 +1420,7 @@ audio {
 }
 
 .dark .thinking-block {
-    background-color: transparent;
+    background-color: var(--darker-gray);
     border: 1px solid var(--input-border-color);
 }
 
@@ -1742,7 +1769,7 @@ button:focus {
 }
 
 .dark .sidebar-vertical-separator {
-    border-bottom: 1px solid rgb(255 255 255 / 10%);
+    border-bottom: 1px solid rgba(255, 255, 255, 0.06);
 }
 
 button#swap-height-width {
@@ -1932,7 +1959,7 @@ thead + tbody tr:first-child th { border-top: 1px solid; }
 .dark #tools-group .wrap::-webkit-scrollbar-thumb,
 .dark #tools-group .wrap::-webkit-scrollbar-thumb:hover {
     background: rgb(255 255 255 / 6.25%);
-    border-radius: 10px;
+    border-radius: 30px;
 }
 
 #tools-group .wrap::-webkit-scrollbar-corner {
diff --git a/modules/ui.py b/modules/ui.py
index 3f39a1a4..bbb22266 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -66,7 +66,8 @@ theme = gr.themes.Default(
 if not shared.args.old_colors:
     theme = theme.set(
         # General Colors
-        border_color_primary='#c5c5d2',
+        border_color_primary='#d2d2d8',
+        block_border_color='transparent',
         body_text_color_subdued='#484848',
         background_fill_secondary='#eaeaea',
         background_fill_secondary_dark='var(--selected-item-color-dark, #282930)',
@@ -77,6 +78,12 @@ if not shared.args.old_colors:
         body_text_color='rgb(64, 64, 64)',
         button_secondary_background_fill="white",
         button_secondary_border_color="var(--border-color-primary)",
+        block_title_text_color='*body_text_color',
+        button_primary_background_fill='#374151',
+        button_primary_background_fill_hover='#4b5563',
+        button_primary_background_fill_hover_dark='rgba(255, 255, 255, 0.05)',
+        button_primary_border_color='#374151',
+        button_primary_text_color='white',
         input_shadow="none",
         button_shadow_hover="none",
 
@@ -85,11 +92,11 @@ if not shared.args.old_colors:
         checkbox_background_color_dark='var(--darker-gray, #1C1C1D)',
         block_background_fill_dark='transparent',
         block_border_color_dark='transparent',
-        input_border_color_dark='var(--border-color-dark, #525252)',
-        input_border_color_focus_dark='var(--border-color-dark, #525252)',
-        checkbox_border_color_dark='var(--border-color-dark, #525252)',
-        border_color_primary_dark='var(--border-color-dark, #525252)',
-        button_secondary_border_color_dark='var(--border-color-dark, #525252)',
+        input_border_color_dark='var(--border-color-dark)',
+        input_border_color_focus_dark='var(--border-color-dark)',
+        checkbox_border_color_dark='rgba(255, 255, 255, 0.2)',
+        border_color_primary_dark='var(--border-color-dark)',
+        button_secondary_border_color_dark='var(--border-color-dark)',
         body_background_fill_dark='var(--dark-gray, #212125)',
         button_primary_background_fill_dark='transparent',
         button_secondary_background_fill_dark='transparent',
@@ -107,10 +114,11 @@ if not shared.args.old_colors:
         block_shadow_dark='none',
         input_shadow_focus='none',
         input_shadow_focus_dark='none',
-        button_large_radius='0.375rem',
+        button_large_radius='0.75rem',
         button_large_padding='6px 12px',
-        input_radius='0.375rem',
-        block_radius='0',
+        input_radius='0.5rem',
+        block_radius='0.375rem',
+        button_transition='background-color 0.15s ease, border-color 0.15s ease, color 0.15s ease',
     )
 
 if (shared.user_data_dir / "notification.mp3").exists():
diff --git a/modules/ui_session.py b/modules/ui_session.py
index e1807dea..c0615843 100644
--- a/modules/ui_session.py
+++ b/modules/ui_session.py
@@ -17,7 +17,7 @@ def create_ui():
 
             with gr.Column():
                 gr.Markdown("## Extensions & flags")
-                shared.gradio['save_settings'] = gr.Button(f'Save extensions settings to {shared.user_data_dir}/settings.yaml', elem_classes='refresh-button', interactive=not mu)
+                shared.gradio['save_settings'] = gr.Button(f'Save extensions settings to {shared.user_data_dir}/settings.yaml', interactive=not mu)
                 shared.gradio['reset_interface'] = gr.Button("Apply flags/extensions and restart", interactive=not mu)
                 with gr.Row():
                     with gr.Column():

From 5992e088faf94e5161f50d1dcf5996a10051d71c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 16 Mar 2026 19:34:37 -0700
Subject: [PATCH 126/210] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 14 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index c24f4a9d..ee83ce56 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 7c481224..ae211301 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b1c8f78e..158fc004 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 63ef33ea..f691d872 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 4bc61622..116db442 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 2ec1e61e..62f12e1b 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index ba4c7a04..d6e7896c 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5dfdd9c8..26555e30 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index f62241b3..49f4c553 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 353d9172..6d8f4780 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 5f039318..9764b2e3 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index d8b03102..903da78a 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 4b548dae..0360efdd 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index fd2511f4..08b663e9 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio-4.37.2+custom.11-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.11/gradio_client-1.0.2+custom.11-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 249861b65d0585f3cb290aaeb3d9050c18501ef3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 05:41:05 -0700
Subject: [PATCH 127/210] web search: Update the user agents

---
 modules/web_search.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/web_search.py b/modules/web_search.py
index e13ef62a..2902c7c0 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -48,7 +48,7 @@ def download_web_page(url, timeout=10, include_links=False):
     try:
         _validate_url(url)
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
         }
         max_redirects = 5
         for _ in range(max_redirects):
@@ -82,8 +82,8 @@ def perform_web_search(query, num_pages=3, max_workers=5, timeout=10, fetch_cont
         search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
 
         agents = [
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
-            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
         ]
 
         response = requests.get(search_url, headers={'User-Agent': random.choice(agents)}, timeout=timeout)

From fffcd20f4d83d81b2577c4b9a94352cf8ed64484 Mon Sep 17 00:00:00 2001
From: Raunak-Kumar7 <73169853+Raunak-Kumar7@users.noreply.github.com>
Date: Tue, 17 Mar 2026 23:14:54 +0530
Subject: [PATCH 128/210] superboogav2: Fix broken delete endpoint (#6010)

---
 extensions/superboogav2/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extensions/superboogav2/api.py b/extensions/superboogav2/api.py
index 552c1c2c..99b0e749 100644
--- a/extensions/superboogav2/api.py
+++ b/extensions/superboogav2/api.py
@@ -107,7 +107,7 @@ class Handler(BaseHTTPRequestHandler):
 
             elif path in ['/api/v1/delete', '/api/delete']:
                 metadata = body.get('metadata')
-                if corpus is None:
+                if metadata is None:
                     self._send_412_error("Missing parameter 'metadata'")
                     return
 

From 2d141b54c5e0b5e042826e3d2f46bbaf87db023d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 11:11:12 -0700
Subject: [PATCH 129/210] Fix several typos

---
 README.md                        |  2 +-
 extensions/whisper_stt/readme.md |  4 ++--
 extensions/whisper_stt/script.py | 24 ++++++++++++------------
 modules/shared.py                |  2 +-
 modules/ui_model_menu.py         |  6 +++---
 5 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 989659d1..cabb81fc 100644
--- a/README.md
+++ b/README.md
@@ -313,7 +313,7 @@ llama.cpp:
   --row-split                                          Split the model by rows across GPUs. This may improve multi-gpu performance.
   --no-mmap                                            Prevent mmap from being used.
   --mlock                                              Force the system to keep the model in RAM.
-  --no-kv-offload                                      Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
+  --no-kv-offload                                      Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance.
   --batch-size BATCH_SIZE                              Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size.
   --ubatch-size UBATCH_SIZE                            Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).
   --threads THREADS                                    Number of threads to use.
diff --git a/extensions/whisper_stt/readme.md b/extensions/whisper_stt/readme.md
index 19488f94..7d9d8d23 100644
--- a/extensions/whisper_stt/readme.md
+++ b/extensions/whisper_stt/readme.md
@@ -7,8 +7,8 @@ Allows you to enter your inputs in chat mode using your microphone.
 To adjust your default settings, you can add the following to your settings.yaml file.
 
 ```
-whisper_stt-whipser_language: chinese
-whisper_stt-whipser_model: tiny
+whisper_stt-whisper_language: chinese
+whisper_stt-whisper_model: tiny
 whisper_stt-auto_submit: False
 ```
 
diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py
index d949e93f..cd9175fe 100644
--- a/extensions/whisper_stt/script.py
+++ b/extensions/whisper_stt/script.py
@@ -18,13 +18,13 @@ input_hijack = {
 
 # parameters which can be customized in settings.yaml of webui
 params = {
-    'whipser_language': 'english',
-    'whipser_model': 'small.en',
+    'whisper_language': 'english',
+    'whisper_model': 'small.en',
     'auto_submit': True
 }
 
 startup_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-WHISPERMODEL = whisper.load_model(params['whipser_model'], device=startup_device)
+WHISPERMODEL = whisper.load_model(params['whisper_model'], device=startup_device)
 
 
 def chat_input_modifier(text, visible_text, state):
@@ -36,7 +36,7 @@ def chat_input_modifier(text, visible_text, state):
         return text, visible_text
 
 
-def do_stt(audio, whipser_language):
+def do_stt(audio, whisper_language):
     # use pydub to convert sample_rate and sample_width for whisper input
     dubaudio = AudioSegment.from_file(io.BytesIO(audio))
     dubaudio = dubaudio.set_channels(1)
@@ -46,20 +46,20 @@ def do_stt(audio, whipser_language):
     # same method to get the array as openai whisper repo used from wav file
     audio_np = np.frombuffer(dubaudio.raw_data, np.int16).flatten().astype(np.float32) / 32768.0
 
-    if len(whipser_language) == 0:
+    if len(whisper_language) == 0:
         result = WHISPERMODEL.transcribe(audio=audio_np)
     else:
-        result = WHISPERMODEL.transcribe(audio=audio_np, language=whipser_language)
+        result = WHISPERMODEL.transcribe(audio=audio_np, language=whisper_language)
     return result["text"]
 
 
-def auto_transcribe(audio, auto_submit, whipser_language):
+def auto_transcribe(audio, auto_submit, whisper_language):
     if audio is None or audio == "":
         print("Whisper received no audio data")
         return "", ""
     audio_bytes = base64.b64decode(audio.split(',')[1])
 
-    transcription = do_stt(audio_bytes, whipser_language)
+    transcription = do_stt(audio_bytes, whisper_language)
     if auto_submit:
         input_hijack.update({"state": True, "value": [transcription, transcription]})
     return transcription
@@ -78,7 +78,7 @@ def reload_whispermodel(whisper_model_name: str, whisper_language: str, device:
                 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
             WHISPERMODEL = whisper.load_model(whisper_model_name, device=device)
-            params.update({"whipser_model": whisper_model_name})
+            params.update({"whisper_model": whisper_model_name})
             if ".en" in whisper_model_name:
                 whisper_language = "english"
             audio_update = gr.Audio.update(interactive=True)
@@ -96,8 +96,8 @@ def ui():
             with gr.Accordion("Settings", open=False):
                 auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])
                 device_dropd = gr.Dropdown(label='Device', value=str(startup_device), choices=["cuda", "cpu", "none"])
-                whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "turbo"])
-                whisper_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
+                whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whisper_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "turbo"])
+                whisper_language = gr.Dropdown(label='Whisper Language', value=params['whisper_language'], choices=["english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
 
     audio.change(
         auto_transcribe, [audio, auto_submit, whisper_language], [shared.gradio['textbox']]).then(
@@ -105,7 +105,7 @@ def ui():
 
     device_dropd.input(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
     whisper_model_dropd.change(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
-    whisper_language.change(lambda x: params.update({"whipser_language": x}), whisper_language, None)
+    whisper_language.change(lambda x: params.update({"whisper_language": x}), whisper_language, None)
     auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)
 
 
diff --git a/modules/shared.py b/modules/shared.py
index 329114bb..486f376f 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -101,7 +101,7 @@ group.add_argument('--tensor-split', type=str, default=None, help='Split the mod
 group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
 group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
 group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
-group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the  K, Q, V to the GPU. This saves VRAM but reduces the performance.')
+group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance.')
 group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size.')
 group.add_argument('--ubatch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).')
 group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 5cf0155d..cb2052a4 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -107,7 +107,7 @@ def create_ui():
                                 shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
                                 shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                                 shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
-                                shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the  K, Q, V to the GPU. This saves VRAM but reduces the performance.')
+                                shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance.')
                                 shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
                                 shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
                                 shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
@@ -134,7 +134,7 @@ def create_ui():
                         ui.create_refresh_button(shared.gradio['customized_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button', interactive=not mu)
 
                     shared.gradio['customized_template_submit'] = gr.Button("Submit", variant="primary", interactive=not mu)
-                    gr.Markdown("This allows you to set a customized template for the model currently selected in the \"Model loader\" menu. Whenever the model gets loaded, this template will be used in place of the template specified in the model's medatada, which sometimes is wrong.")
+                    gr.Markdown("This allows you to set a customized template for the model currently selected in the \"Model loader\" menu. Whenever the model gets loaded, this template will be used in place of the template specified in the model's metadata, which sometimes is wrong.")
 
                 with gr.Row():
                     shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready')
@@ -231,7 +231,7 @@ def load_model_wrapper(selected_model, loader, autoload=False):
 def load_lora_wrapper(selected_loras):
     yield ("Applying the following LoRAs to {}:\n\n{}".format(shared.model_name, '\n'.join(selected_loras)))
     add_lora_to_model(selected_loras)
-    yield ("Successfuly applied the LoRAs")
+    yield ("Successfully applied the LoRAs")
 
 
 def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):

From 27a6cdeec11dc2f1536db3c846bb89a93efbdd69 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 11:31:55 -0700
Subject: [PATCH 130/210] Fix multi-turn thinking block corruption for Kimi
 models

---
 modules/chat.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index e4fcaabe..e526689d 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -235,6 +235,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
         tools_in_user_message=False,
         add_generation_prompt=False,
         enable_thinking=state['enable_thinking'],
+        thinking=state['enable_thinking'],
         reasoning_effort=state['reasoning_effort'],
         thinking_budget=-1 if state.get('enable_thinking', True) else 0,
         bos_token=shared.bos_token,
@@ -351,6 +352,27 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
                 messages.insert(insert_pos, msg_dict)
 
+            # Handle <think> blocks (Kimi, DeepSeek, Qwen, etc.)
+            elif '<think>' in assistant_msg:
+                thinking_content = ""
+                final_content = assistant_msg
+
+                parts = assistant_msg.split('<think>', 1)
+                if len(parts) > 1:
+                    potential_content = parts[1]
+                    if '</think>' in potential_content:
+                        thinking_content = potential_content.split('</think>', 1)[0].strip()
+                        final_content = parts[0] + potential_content.split('</think>', 1)[1]
+                    else:
+                        thinking_content = potential_content.strip()
+                        final_content = parts[0]
+
+                msg_dict = {"role": "assistant", "content": final_content.strip()}
+                if thinking_content:
+                    msg_dict["reasoning_content"] = thinking_content
+
+                messages.insert(insert_pos, msg_dict)
+
             else:
                 # Default case (used by all other models)
                 messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})

From 0f5053c0fbe4177b3b5af199d7301cc5e1bca0ac Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 17:57:35 -0700
Subject: [PATCH 131/210] requirements: Update pymupdf

---
 requirements/full/requirements.txt                   | 2 +-
 requirements/full/requirements_amd.txt               | 2 +-
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 2 +-
 requirements/full/requirements_nowheels.txt          | 2 +-
 requirements/portable/requirements.txt               | 2 +-
 requirements/portable/requirements_amd.txt           | 2 +-
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 2 +-
 requirements/portable/requirements_cuda131.txt       | 2 +-
 requirements/portable/requirements_nowheels.txt      | 2 +-
 requirements/portable/requirements_vulkan.txt        | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index ee83ce56..c8479d04 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -14,7 +14,7 @@ pandas
 peft==0.18.*
 Pillow>=9.5.0
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index ae211301..b11e50b7 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -12,7 +12,7 @@ pandas
 peft==0.18.*
 Pillow>=9.5.0
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 158fc004..d147af3f 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -12,7 +12,7 @@ pandas
 peft==0.18.*
 Pillow>=9.5.0
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index f691d872..d284c5d5 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -12,7 +12,7 @@ pandas
 peft==0.18.*
 Pillow>=9.5.0
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 116db442..3952054e 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -12,7 +12,7 @@ pandas
 peft==0.18.*
 Pillow>=9.5.0
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 62f12e1b..77c254e6 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -12,7 +12,7 @@ pandas
 peft==0.18.*
 Pillow>=9.5.0
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index d6e7896c..abf7690c 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -5,7 +5,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 26555e30..0d66c16c 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -5,7 +5,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 49f4c553..0658239a 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -5,7 +5,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 6d8f4780..b66e2b38 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -5,7 +5,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 9764b2e3..bb815bb2 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -5,7 +5,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 903da78a..d57ba40b 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -5,7 +5,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 0360efdd..e8457909 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -5,7 +5,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 08b663e9..6abd8920 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -5,7 +5,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-pymupdf==1.27.1
+pymupdf==1.27.*
 python-docx==1.1.2
 pyyaml
 requests

From f0014ab01c7a51bfa0f269c676404d48112f924b Mon Sep 17 00:00:00 2001
From: RoomWithOutRoof <166608075+Jah-yee@users.noreply.github.com>
Date: Wed, 18 Mar 2026 09:03:48 +0800
Subject: [PATCH 132/210] fix: mutable default argument in LogitsBiasProcessor
 (#7426)

---
 modules/transformers_loader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index 63758ad7..7f521b8c 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -44,8 +44,8 @@ class Stream(transformers.StoppingCriteria):
 
 
 class LogitsBiasProcessor(LogitsProcessor):
-    def __init__(self, logit_bias={}):
-        self.logit_bias = logit_bias
+    def __init__(self, logit_bias=None):
+        self.logit_bias = logit_bias if logit_bias is not None else {}
         if self.logit_bias:
             self.keys = list([int(key) for key in self.logit_bias.keys()])
             values = [self.logit_bias[str(key)] for key in self.keys]

From 73a094a65773a3f2f9e7d626cfaa01893dbd3f88 Mon Sep 17 00:00:00 2001
From: Alvin Tang <alvintang@pm.me>
Date: Wed, 18 Mar 2026 09:06:05 +0800
Subject: [PATCH 133/210] Fix file handle leaks and redundant re-read in
 get_model_metadata (#7422)

---
 modules/models_settings.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index f3c9a986..dcface71 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -34,7 +34,8 @@ def get_model_metadata(model):
 
     path = model_path / 'config.json'
     if path.exists():
-        hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+        with open(path, 'r', encoding='utf-8') as f:
+            hf_metadata = json.loads(f.read())
     else:
         hf_metadata = None
 
@@ -93,7 +94,7 @@ def get_model_metadata(model):
     else:
         # Transformers metadata
         if hf_metadata is not None:
-            metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+            metadata = hf_metadata
             if 'pretrained_config' in metadata:
                 metadata = metadata['pretrained_config']
 
@@ -134,7 +135,8 @@ def get_model_metadata(model):
 
     # 3. Fall back to tokenizer_config.json metadata
     if path.exists():
-        metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+        with open(path, 'r', encoding='utf-8') as f:
+            metadata = json.loads(f.read())
 
         # Only read from metadata if we haven't already loaded from .jinja or .json
         if template is None and 'chat_template' in metadata:

From 2a6b1fdcba676200d2e454534a91e1d334b60bdf Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 18:29:15 -0700
Subject: [PATCH 134/210] Fix `--extra-flags` breaking short long-form-only
 flags like `--rpc`

Closes #7357
---
 modules/llama_cpp_server.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index c3a8d105..321a6d75 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -446,18 +446,21 @@ class LlamaServer:
             elif extra_flags.startswith("'") and extra_flags.endswith("'"):
                 extra_flags = extra_flags[1:-1].strip()
 
+            # llama.cpp flags that only have a long form (--) despite being short
+            long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
+
             for flag_item in extra_flags.split(','):
                 flag_item = flag_item.strip()
                 if '=' in flag_item:
                     flag, value = flag_item.split('=', 1)
                     flag = flag.strip()
                     value = value.strip()
-                    if len(flag) <= 3:
+                    if len(flag) <= 3 and flag not in long_form_only:
                         cmd += [f"-{flag}", value]
                     else:
                         cmd += [f"--{flag}", value]
                 else:
-                    if len(flag_item) <= 3:
+                    if len(flag_item) <= 3 and flag_item not in long_form_only:
                         cmd.append(f"-{flag_item}")
                     else:
                         cmd.append(f"--{flag_item}")

From 7e54e7b7ae62b227fbd896b2daf704db1658baa5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 19:47:55 -0700
Subject: [PATCH 135/210] llama.cpp: Support literal flags in `--extra-flags`
 (e.g. `--rpc`, `--jinja`)

The old format is still accepted for backwards compatibility.
---
 modules/llama_cpp_server.py | 37 +++++++++++++++++++++----------------
 modules/shared.py           |  2 +-
 modules/ui_model_menu.py    |  2 +-
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 321a6d75..6dd36b2a 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -1,6 +1,7 @@
 import json
 import os
 import pprint
+import shlex
 import re
 import socket
 import subprocess
@@ -446,24 +447,28 @@ class LlamaServer:
             elif extra_flags.startswith("'") and extra_flags.endswith("'"):
                 extra_flags = extra_flags[1:-1].strip()
 
-            # llama.cpp flags that only have a long form (--) despite being short
-            long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
+            if extra_flags.startswith('-'):
+                # New literal format: "--jinja --rpc 1222,1222"
+                cmd += shlex.split(extra_flags)
+            else:
+                # Legacy format: "flag1=value1,flag2,flag3=value3"
+                long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
 
-            for flag_item in extra_flags.split(','):
-                flag_item = flag_item.strip()
-                if '=' in flag_item:
-                    flag, value = flag_item.split('=', 1)
-                    flag = flag.strip()
-                    value = value.strip()
-                    if len(flag) <= 3 and flag not in long_form_only:
-                        cmd += [f"-{flag}", value]
+                for flag_item in extra_flags.split(','):
+                    flag_item = flag_item.strip()
+                    if '=' in flag_item:
+                        flag, value = flag_item.split('=', 1)
+                        flag = flag.strip()
+                        value = value.strip()
+                        if len(flag) <= 3 and flag not in long_form_only:
+                            cmd += [f"-{flag}", value]
+                        else:
+                            cmd += [f"--{flag}", value]
                     else:
-                        cmd += [f"--{flag}", value]
-                else:
-                    if len(flag_item) <= 3 and flag_item not in long_form_only:
-                        cmd.append(f"-{flag_item}")
-                    else:
-                        cmd.append(f"--{flag_item}")
+                        if len(flag_item) <= 3 and flag_item not in long_form_only:
+                            cmd.append(f"-{flag_item}")
+                        else:
+                            cmd.append(f"--{flag_item}")
 
         env = os.environ.copy()
         if os.name == 'posix':
diff --git a/modules/shared.py b/modules/shared.py
index 486f376f..2382e714 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -109,7 +109,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
-group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
+group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index cb2052a4..6d8baff1 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -98,7 +98,7 @@ def create_ui():
                                 shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
                                 shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
                                 shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
-                                shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
+                                shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Extra flags to pass to llama-server. Example: --jinja --rpc 192.168.1.100:50052', value=shared.args.extra_flags)
                                 shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
                                 shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
                                 shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')

From c8bb2129baf180c3d3a5d1d410d1e78dc5ddbea3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 22:24:36 -0700
Subject: [PATCH 136/210] Security: server-side file save roots, image URL SSRF
 protection, extension allowlist

---
 modules/chat.py           |  8 +++++--
 modules/image_utils.py    | 13 ++++++++++-
 modules/ui_chat.py        |  4 ++--
 modules/ui_file_saving.py | 46 +++++++++++++++++++++++++++------------
 modules/ui_session.py     |  6 +++--
 modules/utils.py          |  4 ++++
 6 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index e526689d..00f1659b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -2634,19 +2634,23 @@ def handle_load_template_click(instruction_template):
 def handle_save_template_click(instruction_template_str):
     import gradio as gr
     contents = generate_instruction_template_yaml(instruction_template_str)
+    root = str(shared.user_data_dir / 'instruction-templates') + '/'
     return [
         "My Template.yaml",
-        str(shared.user_data_dir / 'instruction-templates') + '/',
+        root,
         contents,
+        root,
         gr.update(visible=True)
     ]
 
 
 def handle_delete_template_click(template):
     import gradio as gr
+    root = str(shared.user_data_dir / 'instruction-templates') + '/'
     return [
         f"{template}.yaml",
-        str(shared.user_data_dir / 'instruction-templates') + '/',
+        root,
+        root,
         gr.update(visible=False)
     ]
 
diff --git a/modules/image_utils.py b/modules/image_utils.py
index d2809fef..b3138790 100644
--- a/modules/image_utils.py
+++ b/modules/image_utils.py
@@ -77,7 +77,18 @@ def process_message_content(content: Any) -> Tuple[str, List[Image.Image]]:
                     # Support external URLs
                     try:
                         import requests
-                        response = requests.get(image_url, timeout=10)
+                        from urllib.parse import urljoin
+                        from modules.web_search import _validate_url
+                        _validate_url(image_url)
+                        url = image_url
+                        for _ in range(5):
+                            response = requests.get(url, timeout=10, allow_redirects=False)
+                            if response.is_redirect and 'Location' in response.headers:
+                                url = urljoin(url, response.headers['Location'])
+                                _validate_url(url)
+                            else:
+                                break
+
                         response.raise_for_status()
                         image_data = response.content
                         image = Image.open(io.BytesIO(image_data))
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index d2a515b8..f1dc7883 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -350,13 +350,13 @@ def create_event_handlers():
     shared.gradio['load_template'].click(chat.handle_load_template_click, gradio('instruction_template'), gradio('instruction_template_str', 'instruction_template'), show_progress=False)
     shared.gradio['save_template'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        chat.handle_save_template_click, gradio('instruction_template_str'), gradio('save_filename', 'save_root', 'save_contents', 'file_saver'), show_progress=False)
+        chat.handle_save_template_click, gradio('instruction_template_str'), gradio('save_filename', 'save_root', 'save_contents', 'save_root_state', 'file_saver'), show_progress=False)
 
     shared.gradio['restore_character'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.restore_character_for_ui, gradio('interface_state'), gradio('interface_state', 'name2', 'context', 'greeting', 'character_picture'), show_progress=False)
 
-    shared.gradio['delete_template'].click(chat.handle_delete_template_click, gradio('instruction_template'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False)
+    shared.gradio['delete_template'].click(chat.handle_delete_template_click, gradio('instruction_template'), gradio('delete_filename', 'delete_root', 'delete_root_state', 'file_deleter'), show_progress=False)
     shared.gradio['save_chat_history'].click(
         lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then(
         None, gradio('temporary_text', 'character_menu', 'mode'), None, js=f'(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}')
diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py
index 3ed256f8..99c4edd5 100644
--- a/modules/ui_file_saving.py
+++ b/modules/ui_file_saving.py
@@ -9,6 +9,12 @@ from modules.utils import gradio, sanitize_filename
 def create_ui():
     mu = shared.args.multi_user
 
+    # Server-side per-session root paths for the generic file saver/deleter.
+    # Set by the handler that opens the dialog, read by the confirm handler.
+    # Using gr.State so they are session-scoped and safe for multi-user.
+    shared.gradio['save_root_state'] = gr.State(None)
+    shared.gradio['delete_root_state'] = gr.State(None)
+
     # Text file saver
     with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['file_saver']:
         shared.gradio['save_filename'] = gr.Textbox(lines=1, label='File name')
@@ -66,13 +72,13 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         handle_save_preset_click, gradio('interface_state'), gradio('save_preset_contents', 'save_preset_filename', 'preset_saver'), show_progress=False)
 
-    shared.gradio['delete_preset'].click(handle_delete_preset_click, gradio('preset_menu'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False)
-    shared.gradio['save_grammar'].click(handle_save_grammar_click, gradio('grammar_string'), gradio('save_contents', 'save_filename', 'save_root', 'file_saver'), show_progress=False)
-    shared.gradio['delete_grammar'].click(handle_delete_grammar_click, gradio('grammar_file'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False)
+    shared.gradio['delete_preset'].click(handle_delete_preset_click, gradio('preset_menu'), gradio('delete_filename', 'delete_root', 'delete_root_state', 'file_deleter'), show_progress=False)
+    shared.gradio['save_grammar'].click(handle_save_grammar_click, gradio('grammar_string'), gradio('save_contents', 'save_filename', 'save_root', 'save_root_state', 'file_saver'), show_progress=False)
+    shared.gradio['delete_grammar'].click(handle_delete_grammar_click, gradio('grammar_file'), gradio('delete_filename', 'delete_root', 'delete_root_state', 'file_deleter'), show_progress=False)
 
     shared.gradio['save_preset_confirm'].click(handle_save_preset_confirm_click, gradio('save_preset_filename', 'save_preset_contents'), gradio('preset_menu', 'preset_saver'), show_progress=False)
-    shared.gradio['save_confirm'].click(handle_save_confirm_click, gradio('save_root', 'save_filename', 'save_contents'), gradio('file_saver'), show_progress=False)
-    shared.gradio['delete_confirm'].click(handle_delete_confirm_click, gradio('delete_root', 'delete_filename'), gradio('file_deleter'), show_progress=False)
+    shared.gradio['save_confirm'].click(handle_save_confirm_click, gradio('save_root_state', 'save_filename', 'save_contents'), gradio('save_root_state', 'file_saver'), show_progress=False)
+    shared.gradio['delete_confirm'].click(handle_delete_confirm_click, gradio('delete_root_state', 'delete_filename'), gradio('delete_root_state', 'file_deleter'), show_progress=False)
     shared.gradio['save_character_confirm'].click(handle_save_character_confirm_click, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), gradio('character_menu', 'character_saver'), show_progress=False)
     shared.gradio['delete_character_confirm'].click(handle_delete_character_confirm_click, gradio('character_menu'), gradio('character_menu', 'character_deleter'), show_progress=False)
 
@@ -105,24 +111,30 @@ def handle_save_preset_confirm_click(filename, contents):
     ]
 
 
-def handle_save_confirm_click(root, filename, contents):
+def handle_save_confirm_click(root_state, filename, contents):
     try:
+        if root_state is None:
+            return None, gr.update(visible=False)
+
         filename = sanitize_filename(filename)
-        utils.save_file(root + filename, contents)
+        utils.save_file(root_state + filename, contents)
     except Exception:
         traceback.print_exc()
 
-    return gr.update(visible=False)
+    return None, gr.update(visible=False)
 
 
-def handle_delete_confirm_click(root, filename):
+def handle_delete_confirm_click(root_state, filename):
     try:
+        if root_state is None:
+            return None, gr.update(visible=False)
+
         filename = sanitize_filename(filename)
-        utils.delete_file(root + filename)
+        utils.delete_file(root_state + filename)
     except Exception:
         traceback.print_exc()
 
-    return gr.update(visible=False)
+    return None, gr.update(visible=False)
 
 
 def handle_save_character_confirm_click(name2, greeting, context, character_picture, filename):
@@ -165,26 +177,32 @@ def handle_save_preset_click(state):
 
 
 def handle_delete_preset_click(preset):
+    root = str(shared.user_data_dir / "presets") + "/"
     return [
         f"{preset}.yaml",
-        str(shared.user_data_dir / "presets") + "/",
+        root,
+        root,
         gr.update(visible=True)
     ]
 
 
 def handle_save_grammar_click(grammar_string):
+    root = str(shared.user_data_dir / "grammars") + "/"
     return [
         grammar_string,
         "My Fancy Grammar.gbnf",
-        str(shared.user_data_dir / "grammars") + "/",
+        root,
+        root,
         gr.update(visible=True)
     ]
 
 
 def handle_delete_grammar_click(grammar_file):
+    root = str(shared.user_data_dir / "grammars") + "/"
     return [
         grammar_file,
-        str(shared.user_data_dir / "grammars") + "/",
+        root,
+        root,
         gr.update(visible=True)
     ]
 
diff --git a/modules/ui_session.py b/modules/ui_session.py
index c0615843..19026fbb 100644
--- a/modules/ui_session.py
+++ b/modules/ui_session.py
@@ -30,7 +30,7 @@ def create_ui():
         if not mu:
             shared.gradio['save_settings'].click(
                 ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-                handle_save_settings, gradio('interface_state', 'preset_menu', 'extensions_menu', 'show_controls', 'theme_state'), gradio('save_contents', 'save_filename', 'save_root', 'file_saver'), show_progress=False)
+                handle_save_settings, gradio('interface_state', 'preset_menu', 'extensions_menu', 'show_controls', 'theme_state'), gradio('save_contents', 'save_filename', 'save_root', 'save_root_state', 'file_saver'), show_progress=False)
 
         shared.gradio['toggle_dark_mode'].click(
             lambda x: 'dark' if x == 'light' else 'light', gradio('theme_state'), gradio('theme_state')).then(
@@ -51,10 +51,12 @@ def create_ui():
 
 def handle_save_settings(state, preset, extensions, show_controls, theme):
     contents = ui.save_settings(state, preset, extensions, show_controls, theme, manual_save=True)
+    root = str(shared.user_data_dir) + "/"
     return [
         contents,
         "settings.yaml",
-        str(shared.user_data_dir) + "/",
+        root,
+        root,
         gr.update(visible=True)
     ]
 
diff --git a/modules/utils.py b/modules/utils.py
index a14f8b8f..ff32e974 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -47,6 +47,10 @@ def save_file(fname, contents):
         logger.error(f'Invalid file path: \"{fname}\"')
         return
 
+    if Path(abs_path_str).suffix.lower() not in ('.yaml', '.yml', '.json', '.txt', '.gbnf'):
+        logger.error(f'Refusing to save file with disallowed extension: \"{fname}\"')
+        return
+
     with open(abs_path_str, 'w', encoding='utf-8') as f:
         f.write(contents)
 

From 256431f25869fb89326021b8051e340ef275a416 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 22:24:36 -0700
Subject: [PATCH 137/210] Security: server-side file save roots, image URL SSRF
 protection, extension allowlist

---
 modules/chat.py           |  8 +++++--
 modules/image_utils.py    | 13 ++++++++++-
 modules/ui_chat.py        |  4 ++--
 modules/ui_file_saving.py | 46 +++++++++++++++++++++++++++------------
 modules/ui_session.py     |  6 +++--
 modules/utils.py          |  4 ++++
 6 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index e4fcaabe..e37c7a4e 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -2612,19 +2612,23 @@ def handle_load_template_click(instruction_template):
 def handle_save_template_click(instruction_template_str):
     import gradio as gr
     contents = generate_instruction_template_yaml(instruction_template_str)
+    root = str(shared.user_data_dir / 'instruction-templates') + '/'
     return [
         "My Template.yaml",
-        str(shared.user_data_dir / 'instruction-templates') + '/',
+        root,
         contents,
+        root,
         gr.update(visible=True)
     ]
 
 
 def handle_delete_template_click(template):
     import gradio as gr
+    root = str(shared.user_data_dir / 'instruction-templates') + '/'
     return [
         f"{template}.yaml",
-        str(shared.user_data_dir / 'instruction-templates') + '/',
+        root,
+        root,
         gr.update(visible=False)
     ]
 
diff --git a/modules/image_utils.py b/modules/image_utils.py
index d2809fef..b3138790 100644
--- a/modules/image_utils.py
+++ b/modules/image_utils.py
@@ -77,7 +77,18 @@ def process_message_content(content: Any) -> Tuple[str, List[Image.Image]]:
                     # Support external URLs
                     try:
                         import requests
-                        response = requests.get(image_url, timeout=10)
+                        from urllib.parse import urljoin
+                        from modules.web_search import _validate_url
+                        _validate_url(image_url)
+                        url = image_url
+                        for _ in range(5):
+                            response = requests.get(url, timeout=10, allow_redirects=False)
+                            if response.is_redirect and 'Location' in response.headers:
+                                url = urljoin(url, response.headers['Location'])
+                                _validate_url(url)
+                            else:
+                                break
+
                         response.raise_for_status()
                         image_data = response.content
                         image = Image.open(io.BytesIO(image_data))
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index d2a515b8..f1dc7883 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -350,13 +350,13 @@ def create_event_handlers():
     shared.gradio['load_template'].click(chat.handle_load_template_click, gradio('instruction_template'), gradio('instruction_template_str', 'instruction_template'), show_progress=False)
     shared.gradio['save_template'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        chat.handle_save_template_click, gradio('instruction_template_str'), gradio('save_filename', 'save_root', 'save_contents', 'file_saver'), show_progress=False)
+        chat.handle_save_template_click, gradio('instruction_template_str'), gradio('save_filename', 'save_root', 'save_contents', 'save_root_state', 'file_saver'), show_progress=False)
 
     shared.gradio['restore_character'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         chat.restore_character_for_ui, gradio('interface_state'), gradio('interface_state', 'name2', 'context', 'greeting', 'character_picture'), show_progress=False)
 
-    shared.gradio['delete_template'].click(chat.handle_delete_template_click, gradio('instruction_template'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False)
+    shared.gradio['delete_template'].click(chat.handle_delete_template_click, gradio('instruction_template'), gradio('delete_filename', 'delete_root', 'delete_root_state', 'file_deleter'), show_progress=False)
     shared.gradio['save_chat_history'].click(
         lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then(
         None, gradio('temporary_text', 'character_menu', 'mode'), None, js=f'(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}')
diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py
index 3ed256f8..99c4edd5 100644
--- a/modules/ui_file_saving.py
+++ b/modules/ui_file_saving.py
@@ -9,6 +9,12 @@ from modules.utils import gradio, sanitize_filename
 def create_ui():
     mu = shared.args.multi_user
 
+    # Server-side per-session root paths for the generic file saver/deleter.
+    # Set by the handler that opens the dialog, read by the confirm handler.
+    # Using gr.State so they are session-scoped and safe for multi-user.
+    shared.gradio['save_root_state'] = gr.State(None)
+    shared.gradio['delete_root_state'] = gr.State(None)
+
     # Text file saver
     with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['file_saver']:
         shared.gradio['save_filename'] = gr.Textbox(lines=1, label='File name')
@@ -66,13 +72,13 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         handle_save_preset_click, gradio('interface_state'), gradio('save_preset_contents', 'save_preset_filename', 'preset_saver'), show_progress=False)
 
-    shared.gradio['delete_preset'].click(handle_delete_preset_click, gradio('preset_menu'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False)
-    shared.gradio['save_grammar'].click(handle_save_grammar_click, gradio('grammar_string'), gradio('save_contents', 'save_filename', 'save_root', 'file_saver'), show_progress=False)
-    shared.gradio['delete_grammar'].click(handle_delete_grammar_click, gradio('grammar_file'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False)
+    shared.gradio['delete_preset'].click(handle_delete_preset_click, gradio('preset_menu'), gradio('delete_filename', 'delete_root', 'delete_root_state', 'file_deleter'), show_progress=False)
+    shared.gradio['save_grammar'].click(handle_save_grammar_click, gradio('grammar_string'), gradio('save_contents', 'save_filename', 'save_root', 'save_root_state', 'file_saver'), show_progress=False)
+    shared.gradio['delete_grammar'].click(handle_delete_grammar_click, gradio('grammar_file'), gradio('delete_filename', 'delete_root', 'delete_root_state', 'file_deleter'), show_progress=False)
 
     shared.gradio['save_preset_confirm'].click(handle_save_preset_confirm_click, gradio('save_preset_filename', 'save_preset_contents'), gradio('preset_menu', 'preset_saver'), show_progress=False)
-    shared.gradio['save_confirm'].click(handle_save_confirm_click, gradio('save_root', 'save_filename', 'save_contents'), gradio('file_saver'), show_progress=False)
-    shared.gradio['delete_confirm'].click(handle_delete_confirm_click, gradio('delete_root', 'delete_filename'), gradio('file_deleter'), show_progress=False)
+    shared.gradio['save_confirm'].click(handle_save_confirm_click, gradio('save_root_state', 'save_filename', 'save_contents'), gradio('save_root_state', 'file_saver'), show_progress=False)
+    shared.gradio['delete_confirm'].click(handle_delete_confirm_click, gradio('delete_root_state', 'delete_filename'), gradio('delete_root_state', 'file_deleter'), show_progress=False)
     shared.gradio['save_character_confirm'].click(handle_save_character_confirm_click, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), gradio('character_menu', 'character_saver'), show_progress=False)
     shared.gradio['delete_character_confirm'].click(handle_delete_character_confirm_click, gradio('character_menu'), gradio('character_menu', 'character_deleter'), show_progress=False)
 
@@ -105,24 +111,30 @@ def handle_save_preset_confirm_click(filename, contents):
     ]
 
 
-def handle_save_confirm_click(root, filename, contents):
+def handle_save_confirm_click(root_state, filename, contents):
     try:
+        if root_state is None:
+            return None, gr.update(visible=False)
+
         filename = sanitize_filename(filename)
-        utils.save_file(root + filename, contents)
+        utils.save_file(root_state + filename, contents)
     except Exception:
         traceback.print_exc()
 
-    return gr.update(visible=False)
+    return None, gr.update(visible=False)
 
 
-def handle_delete_confirm_click(root, filename):
+def handle_delete_confirm_click(root_state, filename):
     try:
+        if root_state is None:
+            return None, gr.update(visible=False)
+
         filename = sanitize_filename(filename)
-        utils.delete_file(root + filename)
+        utils.delete_file(root_state + filename)
     except Exception:
         traceback.print_exc()
 
-    return gr.update(visible=False)
+    return None, gr.update(visible=False)
 
 
 def handle_save_character_confirm_click(name2, greeting, context, character_picture, filename):
@@ -165,26 +177,32 @@ def handle_save_preset_click(state):
 
 
 def handle_delete_preset_click(preset):
+    root = str(shared.user_data_dir / "presets") + "/"
     return [
         f"{preset}.yaml",
-        str(shared.user_data_dir / "presets") + "/",
+        root,
+        root,
         gr.update(visible=True)
     ]
 
 
 def handle_save_grammar_click(grammar_string):
+    root = str(shared.user_data_dir / "grammars") + "/"
     return [
         grammar_string,
         "My Fancy Grammar.gbnf",
-        str(shared.user_data_dir / "grammars") + "/",
+        root,
+        root,
         gr.update(visible=True)
     ]
 
 
 def handle_delete_grammar_click(grammar_file):
+    root = str(shared.user_data_dir / "grammars") + "/"
     return [
         grammar_file,
-        str(shared.user_data_dir / "grammars") + "/",
+        root,
+        root,
         gr.update(visible=True)
     ]
 
diff --git a/modules/ui_session.py b/modules/ui_session.py
index e1807dea..897bfd28 100644
--- a/modules/ui_session.py
+++ b/modules/ui_session.py
@@ -30,7 +30,7 @@ def create_ui():
         if not mu:
             shared.gradio['save_settings'].click(
                 ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-                handle_save_settings, gradio('interface_state', 'preset_menu', 'extensions_menu', 'show_controls', 'theme_state'), gradio('save_contents', 'save_filename', 'save_root', 'file_saver'), show_progress=False)
+                handle_save_settings, gradio('interface_state', 'preset_menu', 'extensions_menu', 'show_controls', 'theme_state'), gradio('save_contents', 'save_filename', 'save_root', 'save_root_state', 'file_saver'), show_progress=False)
 
         shared.gradio['toggle_dark_mode'].click(
             lambda x: 'dark' if x == 'light' else 'light', gradio('theme_state'), gradio('theme_state')).then(
@@ -51,10 +51,12 @@ def create_ui():
 
 def handle_save_settings(state, preset, extensions, show_controls, theme):
     contents = ui.save_settings(state, preset, extensions, show_controls, theme, manual_save=True)
+    root = str(shared.user_data_dir) + "/"
     return [
         contents,
         "settings.yaml",
-        str(shared.user_data_dir) + "/",
+        root,
+        root,
         gr.update(visible=True)
     ]
 
diff --git a/modules/utils.py b/modules/utils.py
index a14f8b8f..ff32e974 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -47,6 +47,10 @@ def save_file(fname, contents):
         logger.error(f'Invalid file path: \"{fname}\"')
         return
 
+    if Path(abs_path_str).suffix.lower() not in ('.yaml', '.yml', '.json', '.txt', '.gbnf'):
+        logger.error(f'Refusing to save file with disallowed extension: \"{fname}\"')
+        return
+
     with open(abs_path_str, 'w', encoding='utf-8') as f:
         f.write(contents)
 

From fef2bd863056b1dfc5d1f2d0cda6c8f677b6729f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 22:52:32 -0700
Subject: [PATCH 138/210] UI: Fix the instruction template delete dialog not
 appearing

---
 modules/chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index 00f1659b..393507a1 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -2651,7 +2651,7 @@ def handle_delete_template_click(template):
         f"{template}.yaml",
         root,
         root,
-        gr.update(visible=False)
+        gr.update(visible=True)
     ]
 
 

From ca36bd6eb637d9f99b1d459dfb74406bf4eb03d0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 18 Mar 2026 07:21:31 -0700
Subject: [PATCH 139/210] API: Remove leading spaces from post-reasoning
 `content`

---
 modules/reasoning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/reasoning.py b/modules/reasoning.py
index bc61aab3..9c92719b 100644
--- a/modules/reasoning.py
+++ b/modules/reasoning.py
@@ -79,7 +79,7 @@ def extract_reasoning(text, html_escaped=False):
             else:
                 content_start = end_pos + len(end_esc)
 
-        return text[thought_start:thought_end], text[content_start:]
+        return text[thought_start:thought_end], text[content_start:].lstrip()
 
     # Handle standalone GPT-OSS final channel marker without a preceding
     # analysis/commentary block (the model skipped thinking entirely).

From eeb0e5700f2e1c237998ddd4de6bfdf9223a7606 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 18 Mar 2026 09:15:40 -0700
Subject: [PATCH 140/210] Fix AMD installer failing to resolve ROCm triton
 dependency

Closes #7436
---
 one_click.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/one_click.py b/one_click.py
index d6ba9039..68998734 100644
--- a/one_click.py
+++ b/one_click.py
@@ -117,7 +117,7 @@ def get_pytorch_install_command(gpu_choice):
         return base_cmd + "--index-url https://download.pytorch.org/whl/cu128" + pypi_fallback
     elif gpu_choice == "AMD":
         py_tag = f"cp{PYTHON_VERSION.replace('.', '')}"
-        return f"python -m pip install https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-{TORCH_VERSION}%2Brocm7.2.0.lw.git7e1940d4-{py_tag}-{py_tag}-linux_x86_64.whl"
+        return f"python -m pip install https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-{TORCH_VERSION}%2Brocm7.2.0.lw.git7e1940d4-{py_tag}-{py_tag}-linux_x86_64.whl --find-links https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/"
     elif gpu_choice in ["APPLE", "NONE"]:
         return base_cmd + "--index-url https://download.pytorch.org/whl/cpu" + pypi_fallback
     elif gpu_choice == "INTEL":
@@ -135,7 +135,7 @@ def get_pytorch_update_command(gpu_choice):
         return f"{base_cmd}--index-url https://download.pytorch.org/whl/cu128" + pypi_fallback
     elif gpu_choice == "AMD":
         py_tag = f"cp{PYTHON_VERSION.replace('.', '')}"
-        return f"python -m pip install --upgrade https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-{TORCH_VERSION}%2Brocm7.2.0.lw.git7e1940d4-{py_tag}-{py_tag}-linux_x86_64.whl"
+        return f"python -m pip install --upgrade https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-{TORCH_VERSION}%2Brocm7.2.0.lw.git7e1940d4-{py_tag}-{py_tag}-linux_x86_64.whl --find-links https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/"
     elif gpu_choice in ["APPLE", "NONE"]:
         return f"{base_cmd}--index-url https://download.pytorch.org/whl/cpu" + pypi_fallback
     elif gpu_choice == "INTEL":

From 779e7611ff9a4528d6b54e53987e956cc4685128 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 18 Mar 2026 20:42:20 -0700
Subject: [PATCH 141/210] Use `logger.exception()` instead of
 `traceback.print_exc()` for error messages

---
 modules/callbacks.py           |  5 ++---
 modules/exllamav3.py           |  6 ++----
 modules/exllamav3_hf.py        |  4 +---
 modules/extensions.py          |  4 +---
 modules/logits.py              |  3 +--
 modules/text_generation.py     |  5 ++---
 modules/training.py            |  6 ++----
 modules/ui_file_saving.py      | 17 ++++++++---------
 modules/ui_image_generation.py |  5 ++---
 modules/ui_model_menu.py       |  6 ++----
 10 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/modules/callbacks.py b/modules/callbacks.py
index afddf92d..89fb6c08 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -1,8 +1,8 @@
-import traceback
 from queue import Queue
 from threading import Thread
 
 import modules.shared as shared
+from modules.logging_colors import logger
 
 
 class StopNowException(Exception):
@@ -38,8 +38,7 @@ class Iteratorize:
             except StopNowException:
                 pass
             except Exception:
-                traceback.print_exc()
-                pass
+                logger.exception("Failed in generation callback")
 
             self.q.put(self.sentinel)
             if self.c_callback:
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 1c682e49..75c76c7c 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -1,7 +1,6 @@
 import math
 import queue
 import threading
-import traceback
 from pathlib import Path
 from typing import Any, List, Tuple
 
@@ -34,8 +33,7 @@ from modules.text_generation import get_max_prompt_length
 try:
     import flash_attn
 except Exception:
-    logger.warning('Failed to load flash-attention due to the following error:\n')
-    traceback.print_exc()
+    logger.warning('Failed to load flash-attention due to the following error:', exc_info=True)
 
 
 class LogitBiasFilter(Filter):
@@ -81,7 +79,7 @@ class ConcurrentGenerator:
                 try:
                     results = self.generator.iterate()
                 except Exception:
-                    logger.error("Exception in ConcurrentGenerator iterate loop:\n" + traceback.format_exc())
+                    logger.exception("Exception in ConcurrentGenerator iterate loop")
                     for q in self.job_queues.values():
                         q.put(None)
                     self.job_queues.clear()
diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index d3c1cb90..e0ad5002 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -1,5 +1,4 @@
 import os
-import traceback
 from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
@@ -21,8 +20,7 @@ from modules.logging_colors import logger
 try:
     import flash_attn
 except Exception:
-    logger.warning('Failed to load flash-attention due to the following error:\n')
-    traceback.print_exc()
+    logger.warning('Failed to load flash-attention due to the following error:', exc_info=True)
 
 
 class Exllamav3HF(PreTrainedModel, GenerationMixin):
diff --git a/modules/extensions.py b/modules/extensions.py
index e58a9a4c..4bb7b683 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -1,7 +1,6 @@
 import importlib
 import importlib.util
 import sys
-import traceback
 from functools import partial
 from inspect import signature
 from pathlib import Path
@@ -75,8 +74,7 @@ def load_extensions():
             raise
 
         except Exception:
-            logger.error(f'Failed to load the extension "{name}".')
-            traceback.print_exc()
+            logger.exception(f'Failed to load the extension "{name}".')
 
 
 # This iterator returns the extensions in the order specified in the command-line
diff --git a/modules/logits.py b/modules/logits.py
index 2d066c09..1f878f27 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -1,5 +1,4 @@
 import time
-import traceback
 
 import numpy as np
 
@@ -23,7 +22,7 @@ def get_next_logits(*args, **kwargs):
     try:
         result = _get_next_logits(*args, **kwargs)
     except Exception:
-        traceback.print_exc()
+        logger.exception("Failed to get next logits")
         result = None
 
     if needs_lock:
diff --git a/modules/text_generation.py b/modules/text_generation.py
index d487cd2f..f77be124 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -4,7 +4,6 @@ import html
 import pprint
 import random
 import time
-import traceback
 
 import numpy as np
 
@@ -477,7 +476,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
                     yield cumulative_reply
 
     except Exception:
-        traceback.print_exc()
+        logger.exception("Failed to generate reply (HF)")
     finally:
         t1 = time.time()
         original_tokens = len(original_input_ids[0])
@@ -510,7 +509,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
                 yield reply
 
     except Exception:
-        traceback.print_exc()
+        logger.exception("Failed to generate reply (custom)")
     finally:
         t1 = time.time()
 
diff --git a/modules/training.py b/modules/training.py
index db7b206b..a13a2864 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -546,10 +546,8 @@ def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en:
                     yield f"Failed to load {selected_model}."
                     return
             except Exception:
-                exc = traceback.format_exc()
-                logger.error('Failed to reload the model.')
-                print(exc)
-                yield exc.replace('\n', '\n\n')
+                logger.exception('Failed to reload the model.')
+                yield traceback.format_exc().replace('\n', '\n\n')
                 return
 
     # == Start prepping the model itself ==
diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py
index 99c4edd5..e5018700 100644
--- a/modules/ui_file_saving.py
+++ b/modules/ui_file_saving.py
@@ -1,8 +1,7 @@
-import traceback
-
 import gradio as gr
 
 from modules import chat, presets, shared, ui, utils
+from modules.logging_colors import logger
 from modules.utils import gradio, sanitize_filename
 
 
@@ -103,7 +102,7 @@ def handle_save_preset_confirm_click(filename, contents):
         output = gr.update(choices=available_presets, value=filename)
     except Exception:
         output = gr.update()
-        traceback.print_exc()
+        logger.exception("Failed to save preset")
 
     return [
         output,
@@ -119,7 +118,7 @@ def handle_save_confirm_click(root_state, filename, contents):
         filename = sanitize_filename(filename)
         utils.save_file(root_state + filename, contents)
     except Exception:
-        traceback.print_exc()
+        logger.exception("Failed to save file")
 
     return None, gr.update(visible=False)
 
@@ -132,7 +131,7 @@ def handle_delete_confirm_click(root_state, filename):
         filename = sanitize_filename(filename)
         utils.delete_file(root_state + filename)
     except Exception:
-        traceback.print_exc()
+        logger.exception("Failed to delete file")
 
     return None, gr.update(visible=False)
 
@@ -144,7 +143,7 @@ def handle_save_character_confirm_click(name2, greeting, context, character_pict
         output = gr.update(choices=available_characters, value=filename)
     except Exception:
         output = gr.update()
-        traceback.print_exc()
+        logger.exception("Failed to save character")
 
     return [
         output,
@@ -159,7 +158,7 @@ def handle_delete_character_confirm_click(character):
         output = chat.update_character_menu_after_deletion(index)
     except Exception:
         output = gr.update()
-        traceback.print_exc()
+        logger.exception("Failed to delete character")
 
     return [
         output,
@@ -214,7 +213,7 @@ def handle_save_user_confirm_click(name1, user_bio, your_picture, filename):
         output = gr.update(choices=available_users, value=filename)
     except Exception:
         output = gr.update()
-        traceback.print_exc()
+        logger.exception("Failed to save user")
 
     return [
         output,
@@ -229,7 +228,7 @@ def handle_delete_user_confirm_click(user):
         output = chat.update_user_menu_after_deletion(index)
     except Exception:
         output = gr.update()
-        traceback.print_exc()
+        logger.exception("Failed to delete user")
 
     return [
         output,
diff --git a/modules/ui_image_generation.py b/modules/ui_image_generation.py
index dc108f6d..1efb2479 100644
--- a/modules/ui_image_generation.py
+++ b/modules/ui_image_generation.py
@@ -916,9 +916,8 @@ def generate(state, save_images=True):
         yield all_images, progress_bar_html()
         clear_torch_cache()
 
-    except Exception as e:
-        logger.error(f"Image generation failed: {e}")
-        traceback.print_exc()
+    except Exception:
+        logger.exception("Image generation failed")
         yield [], progress_bar_html()
         clear_torch_cache()
 
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 6d8baff1..5b7621a7 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -222,10 +222,8 @@ def load_model_wrapper(selected_model, loader, autoload=False):
             else:
                 yield f"Failed to load `{selected_model}`."
         except Exception:
-            exc = traceback.format_exc()
-            logger.error('Failed to load the model.')
-            print(exc)
-            yield exc.replace('\n', '\n\n')
+            logger.exception('Failed to load the model.')
+            yield traceback.format_exc().replace('\n', '\n\n')
 
 
 def load_lora_wrapper(selected_loras):

From dde1764763ac35f4ecc60e13e2954835400256a9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 18 Mar 2026 21:05:42 -0700
Subject: [PATCH 142/210] Cleanup `modules/chat.py`

---
 modules/chat.py | 119 ++++++++++++++++--------------------------------
 1 file changed, 40 insertions(+), 79 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 393507a1..148d559a 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -70,9 +70,7 @@ def update_message_metadata(metadata_dict, role, index, **fields):
     if key not in metadata_dict:
         metadata_dict[key] = {}
 
-    # Update with provided fields
-    for field_name, field_value in fields.items():
-        metadata_dict[key][field_name] = field_value
+    metadata_dict[key].update(fields)
 
 
 jinja_env = ImmutableSandboxedEnvironment(
@@ -212,6 +210,24 @@ def _expand_tool_sequence(tool_seq):
     return messages
 
 
+def _format_attachments(attachments, include_text=True):
+    """Build image ref and text attachment strings from a list of attachments."""
+    attachments_text = ""
+    image_refs = ""
+    for attachment in attachments:
+        if attachment.get("type") == "image":
+            image_refs += "<__media__>"
+        elif include_text:
+            filename = attachment.get("name", "file")
+            content = attachment.get("content", "")
+            if attachment.get("type") == "text/html" and attachment.get("url"):
+                attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
+            else:
+                attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+
+    return image_refs, attachments_text
+
+
 def generate_chat_prompt(user_input, state, **kwargs):
     impersonate = kwargs.get('impersonate', False)
     _continue = kwargs.get('_continue', False)
@@ -328,41 +344,19 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
                 messages.insert(insert_pos, msg_dict)
 
-            # Handle Seed-OSS
-            elif '<seed:think>' in assistant_msg:
+            # Handle <think> blocks (Kimi, DeepSeek, Qwen, etc.) and Seed-OSS
+            elif '<think>' in assistant_msg or '<seed:think>' in assistant_msg:
+                open_tag = '<think>' if '<think>' in assistant_msg else '<seed:think>'
+                close_tag = '</think>' if open_tag == '<think>' else '</seed:think>'
                 thinking_content = ""
                 final_content = assistant_msg
 
-                # Extract thinking content if present
-                if '<seed:think>' in assistant_msg:
-                    parts = assistant_msg.split('<seed:think>', 1)
-                    if len(parts) > 1:
-                        potential_content = parts[1]
-                        if '</seed:think>' in potential_content:
-                            thinking_content = potential_content.split('</seed:think>', 1)[0].strip()
-                            final_content = parts[0] + potential_content.split('</seed:think>', 1)[1]
-                        else:
-                            thinking_content = potential_content.strip()
-                            final_content = parts[0]
-
-                # Insert as structured message
-                msg_dict = {"role": "assistant", "content": final_content.strip()}
-                if thinking_content:
-                    msg_dict["reasoning_content"] = thinking_content
-
-                messages.insert(insert_pos, msg_dict)
-
-            # Handle <think> blocks (Kimi, DeepSeek, Qwen, etc.)
-            elif '<think>' in assistant_msg:
-                thinking_content = ""
-                final_content = assistant_msg
-
-                parts = assistant_msg.split('<think>', 1)
+                parts = assistant_msg.split(open_tag, 1)
                 if len(parts) > 1:
                     potential_content = parts[1]
-                    if '</think>' in potential_content:
-                        thinking_content = potential_content.split('</think>', 1)[0].strip()
-                        final_content = parts[0] + potential_content.split('</think>', 1)[1]
+                    if close_tag in potential_content:
+                        thinking_content = potential_content.split(close_tag, 1)[0].strip()
+                        final_content = parts[0] + potential_content.split(close_tag, 1)[1]
                     else:
                         thinking_content = potential_content.strip()
                         final_content = parts[0]
@@ -399,22 +393,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
             # Add attachment content if present AND if past attachments are enabled
             if user_key in metadata and "attachments" in metadata[user_key]:
-                attachments_text = ""
-                image_refs = ""
-
-                for attachment in metadata[user_key]["attachments"]:
-                    if attachment.get("type") == "image":
-                        # Add image reference for multimodal models
-                        image_refs += "<__media__>"
-                    elif state.get('include_past_attachments', True):
-                        # Handle text/PDF attachments
-                        filename = attachment.get("name", "file")
-                        content = attachment.get("content", "")
-                        if attachment.get("type") == "text/html" and attachment.get("url"):
-                            attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
-                        else:
-                            attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
-
+                image_refs, attachments_text = _format_attachments(
+                    metadata[user_key]["attachments"],
+                    include_text=state.get('include_past_attachments', True)
+                )
                 if image_refs:
                     enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}"
                 if attachments_text:
@@ -427,37 +409,18 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
     # Check if we have attachments
     if not (impersonate or _continue):
-        has_attachments = False
-        if len(history_data.get('metadata', {})) > 0:
-            current_row_idx = len(history)
-            user_key = f"user_{current_row_idx}"
-            has_attachments = user_key in metadata and "attachments" in metadata[user_key]
+        current_row_idx = len(history)
+        user_key = f"user_{current_row_idx}"
+        has_attachments = user_key in metadata and "attachments" in metadata[user_key]
 
         if user_input or has_attachments:
             # For the current user input being processed, check if we need to add attachments
-            if len(history_data.get('metadata', {})) > 0:
-                current_row_idx = len(history)
-                user_key = f"user_{current_row_idx}"
-
-                if user_key in metadata and "attachments" in metadata[user_key]:
-                    attachments_text = ""
-                    image_refs = ""
-
-                    for attachment in metadata[user_key]["attachments"]:
-                        if attachment.get("type") == "image":
-                            image_refs += "<__media__>"
-                        else:
-                            filename = attachment.get("name", "file")
-                            content = attachment.get("content", "")
-                            if attachment.get("type") == "text/html" and attachment.get("url"):
-                                attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
-                            else:
-                                attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
-
-                    if image_refs:
-                        user_input = f"{image_refs}\n\n{user_input}"
-                    if attachments_text:
-                        user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
+            if has_attachments:
+                image_refs, attachments_text = _format_attachments(metadata[user_key]["attachments"])
+                if image_refs:
+                    user_input = f"{image_refs}\n\n{user_input}"
+                if attachments_text:
+                    user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
 
             messages.append({"role": "user", "content": user_input})
 
@@ -609,7 +572,6 @@ def count_prompt_tokens(text_input, state):
 
     try:
         # Handle dict format with text and files
-        files = []
         if isinstance(text_input, dict):
             files = text_input.get('files', [])
             text = text_input.get('text', '')
@@ -647,7 +609,6 @@ def count_prompt_tokens(text_input, state):
 
 
 def get_stopping_strings(state):
-    stopping_strings = []
     renderers = []
 
     if state['mode'] in ['instruct', 'chat-instruct']:

From 5453b9f30e9354bccb09a8a9b85bd339f4df6a12 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 19 Mar 2026 07:54:37 -0700
Subject: [PATCH 143/210] Remove ancient/obsolete instruction templates

---
 .../instruction-templates/Airoboros-v1.2.yaml | 25 ------------------
 user_data/instruction-templates/Bactrian.yaml | 25 ------------------
 .../instruction-templates/Baichuan Chat.yaml  | 25 ------------------
 user_data/instruction-templates/Baize.yaml    | 25 ------------------
 user_data/instruction-templates/Bluemoon.yaml | 25 ------------------
 user_data/instruction-templates/ChatGLM.yaml  | 25 ------------------
 .../Chinese-Vicuna-Chat.yaml                  | 25 ------------------
 .../instruction-templates/Command-R.yaml      | 26 -------------------
 .../instruction-templates/Galactica Cite.yaml | 25 ------------------
 .../Galactica Finetuned.yaml                  | 25 ------------------
 .../instruction-templates/Galactica Q.yaml    | 25 ------------------
 .../Galactica Summary.yaml                    | 25 ------------------
 .../instruction-templates/Galactica Work.yaml | 25 ------------------
 .../instruction-templates/Galactica v2.yaml   | 25 ------------------
 .../instruction-templates/Galactica.yaml      | 25 ------------------
 user_data/instruction-templates/Gorilla.yaml  | 25 ------------------
 .../Guanaco non-chat.yaml                     | 25 ------------------
 .../instruction-templates/Guanaco-QLoRA.yaml  | 25 ------------------
 .../H2O-prompt_answer.yaml                    | 25 ------------------
 .../instruction-templates/Hippogriff.yaml     | 25 ------------------
 .../instruction-templates/INCITE-Chat.yaml    | 25 ------------------
 .../INCITE-Instruct.yaml                      | 25 ------------------
 user_data/instruction-templates/KoAlpaca.yaml | 25 ------------------
 user_data/instruction-templates/Koala.yaml    | 25 ------------------
 user_data/instruction-templates/LLaVA.yaml    | 25 ------------------
 user_data/instruction-templates/Llama-v2.yaml | 25 ------------------
 user_data/instruction-templates/MOSS.yaml     | 25 ------------------
 .../instruction-templates/Manticore Chat.yaml | 25 ------------------
 user_data/instruction-templates/Metharme.yaml | 25 ------------------
 .../instruction-templates/NVIDIA-ChatQA.yaml  | 25 ------------------
 user_data/instruction-templates/NewHope.yaml  | 25 ------------------
 .../instruction-templates/OpenBuddy.yaml      | 25 ------------------
 user_data/instruction-templates/OpenChat.yaml | 25 ------------------
 .../OpenOrca-Platypus2.yaml                   | 25 ------------------
 .../instruction-templates/Orca Mini.yaml      | 25 ------------------
 .../instruction-templates/Orca-Vicuna.yaml    | 24 -----------------
 .../instruction-templates/RWKV-Raven.yaml     | 25 ------------------
 .../instruction-templates/RWKV-World.yaml     | 25 ------------------
 user_data/instruction-templates/Samantha.yaml | 25 ------------------
 .../instruction-templates/StableBeluga2.yaml  | 25 ------------------
 user_data/instruction-templates/StableLM.yaml | 25 ------------------
 .../instruction-templates/StableVicuna.yaml   | 25 ------------------
 .../instruction-templates/Starchat-Beta.yaml  | 25 ------------------
 .../instruction-templates/Synthia-CoT.yaml    | 25 ------------------
 user_data/instruction-templates/Synthia.yaml  | 25 ------------------
 user_data/instruction-templates/Tulu.yaml     | 25 ------------------
 .../instruction-templates/Vicuna-v0.yaml      | 25 ------------------
 .../instruction-templates/Vigogne-Chat.yaml   | 25 ------------------
 .../Vigogne-Instruct.yaml                     | 25 ------------------
 .../Wizard-Mega ShareGPT.yaml                 | 25 ------------------
 .../instruction-templates/Wizard-Mega.yaml    | 25 ------------------
 user_data/instruction-templates/Ziya.yaml     | 25 ------------------
 52 files changed, 1300 deletions(-)
 delete mode 100644 user_data/instruction-templates/Airoboros-v1.2.yaml
 delete mode 100644 user_data/instruction-templates/Bactrian.yaml
 delete mode 100644 user_data/instruction-templates/Baichuan Chat.yaml
 delete mode 100644 user_data/instruction-templates/Baize.yaml
 delete mode 100644 user_data/instruction-templates/Bluemoon.yaml
 delete mode 100644 user_data/instruction-templates/ChatGLM.yaml
 delete mode 100644 user_data/instruction-templates/Chinese-Vicuna-Chat.yaml
 delete mode 100644 user_data/instruction-templates/Command-R.yaml
 delete mode 100644 user_data/instruction-templates/Galactica Cite.yaml
 delete mode 100644 user_data/instruction-templates/Galactica Finetuned.yaml
 delete mode 100644 user_data/instruction-templates/Galactica Q.yaml
 delete mode 100644 user_data/instruction-templates/Galactica Summary.yaml
 delete mode 100644 user_data/instruction-templates/Galactica Work.yaml
 delete mode 100644 user_data/instruction-templates/Galactica v2.yaml
 delete mode 100644 user_data/instruction-templates/Galactica.yaml
 delete mode 100644 user_data/instruction-templates/Gorilla.yaml
 delete mode 100644 user_data/instruction-templates/Guanaco non-chat.yaml
 delete mode 100644 user_data/instruction-templates/Guanaco-QLoRA.yaml
 delete mode 100644 user_data/instruction-templates/H2O-prompt_answer.yaml
 delete mode 100644 user_data/instruction-templates/Hippogriff.yaml
 delete mode 100644 user_data/instruction-templates/INCITE-Chat.yaml
 delete mode 100644 user_data/instruction-templates/INCITE-Instruct.yaml
 delete mode 100644 user_data/instruction-templates/KoAlpaca.yaml
 delete mode 100644 user_data/instruction-templates/Koala.yaml
 delete mode 100644 user_data/instruction-templates/LLaVA.yaml
 delete mode 100644 user_data/instruction-templates/Llama-v2.yaml
 delete mode 100644 user_data/instruction-templates/MOSS.yaml
 delete mode 100644 user_data/instruction-templates/Manticore Chat.yaml
 delete mode 100644 user_data/instruction-templates/Metharme.yaml
 delete mode 100644 user_data/instruction-templates/NVIDIA-ChatQA.yaml
 delete mode 100644 user_data/instruction-templates/NewHope.yaml
 delete mode 100644 user_data/instruction-templates/OpenBuddy.yaml
 delete mode 100644 user_data/instruction-templates/OpenChat.yaml
 delete mode 100644 user_data/instruction-templates/OpenOrca-Platypus2.yaml
 delete mode 100644 user_data/instruction-templates/Orca Mini.yaml
 delete mode 100644 user_data/instruction-templates/Orca-Vicuna.yaml
 delete mode 100644 user_data/instruction-templates/RWKV-Raven.yaml
 delete mode 100644 user_data/instruction-templates/RWKV-World.yaml
 delete mode 100644 user_data/instruction-templates/Samantha.yaml
 delete mode 100644 user_data/instruction-templates/StableBeluga2.yaml
 delete mode 100644 user_data/instruction-templates/StableLM.yaml
 delete mode 100644 user_data/instruction-templates/StableVicuna.yaml
 delete mode 100644 user_data/instruction-templates/Starchat-Beta.yaml
 delete mode 100644 user_data/instruction-templates/Synthia-CoT.yaml
 delete mode 100644 user_data/instruction-templates/Synthia.yaml
 delete mode 100644 user_data/instruction-templates/Tulu.yaml
 delete mode 100644 user_data/instruction-templates/Vicuna-v0.yaml
 delete mode 100644 user_data/instruction-templates/Vigogne-Chat.yaml
 delete mode 100644 user_data/instruction-templates/Vigogne-Instruct.yaml
 delete mode 100644 user_data/instruction-templates/Wizard-Mega ShareGPT.yaml
 delete mode 100644 user_data/instruction-templates/Wizard-Mega.yaml
 delete mode 100644 user_data/instruction-templates/Ziya.yaml

diff --git a/user_data/instruction-templates/Airoboros-v1.2.yaml b/user_data/instruction-templates/Airoboros-v1.2.yaml
deleted file mode 100644
index 30906214..00000000
--- a/user_data/instruction-templates/Airoboros-v1.2.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user\'s input.' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'ASSISTANT: ' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSISTANT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Bactrian.yaml b/user_data/instruction-templates/Bactrian.yaml
deleted file mode 100644
index dab97e94..00000000
--- a/user_data/instruction-templates/Bactrian.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Input:\n' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### Output:\n' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Output:\n'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Baichuan Chat.yaml b/user_data/instruction-templates/Baichuan Chat.yaml
deleted file mode 100644
index 1882bac8..00000000
--- a/user_data/instruction-templates/Baichuan Chat.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<reserved_102>' + message['content'] + ''-}}
-          {%- else -%}
-              {{-'<reserved_103>' + message['content'] + '</s>' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<reserved_103>'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Baize.yaml b/user_data/instruction-templates/Baize.yaml
deleted file mode 100644
index c34e1db7..00000000
--- a/user_data/instruction-templates/Baize.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n[|Human|]Hello!\n[|AI|]Hi!' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'[|Human|]' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'[|AI|]' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'[|AI|]'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Bluemoon.yaml b/user_data/instruction-templates/Bluemoon.yaml
deleted file mode 100644
index 1fafc1f5..00000000
--- a/user_data/instruction-templates/Bluemoon.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'A transcript of a roleplay between two players, LEAD and ASSOCIATE. LEAD sets up a scenario and the characters, from which ASSOCIATE then assumes a character role and continues the story for that role in response to description given by LEAD. The story and characters are developed by exchange of detailed event descriptions and character dialogs, successively given by both LEAD and ASSOCIATE.' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'LEAD: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'ASSOCIATE: ' + message['content'] + '</s>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSOCIATE:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/ChatGLM.yaml b/user_data/instruction-templates/ChatGLM.yaml
deleted file mode 100644
index 75d51c88..00000000
--- a/user_data/instruction-templates/ChatGLM.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'[Round <|round|>]\n问：' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'答：' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'答：'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Chinese-Vicuna-Chat.yaml b/user_data/instruction-templates/Chinese-Vicuna-Chat.yaml
deleted file mode 100644
index c7966546..00000000
--- a/user_data/instruction-templates/Chinese-Vicuna-Chat.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'The following is a conversation between an AI assistant called Assistant and a human user called User. The assistant is intelligent, knowledgeable and polite to answer questions of user.' + '\n\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'User:' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'Assistant:' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Command-R.yaml b/user_data/instruction-templates/Command-R.yaml
deleted file mode 100644
index f8bb8a08..00000000
--- a/user_data/instruction-templates/Command-R.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-instruction_template: |-
-  {%- if messages[0]['role'] == 'system' -%}
-      {%- set loop_messages = messages[1:] -%}
-      {%- set system_message = messages[0]['content'] -%}
-  {%- elif false == true -%}
-      {%- set loop_messages = messages -%}
-      {%- set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' -%}
-  {%- else -%}
-      {%- set loop_messages = messages -%}
-      {%- set system_message = false -%}
-  {%- endif -%}
-  {%- if system_message != false -%}
-      {{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}
-  {%- endif -%}
-  {%- for message in loop_messages -%}
-      {%- set content = message['content'] -%}
-      {%- if message['role'] == 'user' -%}
-          {{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}
-      {%- elif message['role'] == 'assistant' -%}
-          {{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Galactica Cite.yaml b/user_data/instruction-templates/Galactica Cite.yaml
deleted file mode 100644
index 9f555349..00000000
--- a/user_data/instruction-templates/Galactica Cite.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'' + message['content'] + ' '-}}
-          {%- else -%}
-              {{-'[START_REF]' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'[START_REF]'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Galactica Finetuned.yaml b/user_data/instruction-templates/Galactica Finetuned.yaml
deleted file mode 100644
index e0a66bc1..00000000
--- a/user_data/instruction-templates/Galactica Finetuned.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<question>' + message['content'] + ''-}}
-          {%- else -%}
-              {{-'<answer>' + message['content'] + '' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<answer>'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Galactica Q.yaml b/user_data/instruction-templates/Galactica Q.yaml
deleted file mode 100644
index 63319006..00000000
--- a/user_data/instruction-templates/Galactica Q.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'Q: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'A: ' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'A:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Galactica Summary.yaml b/user_data/instruction-templates/Galactica Summary.yaml
deleted file mode 100644
index e249f268..00000000
--- a/user_data/instruction-templates/Galactica Summary.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'TLDR:' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'TLDR:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Galactica Work.yaml b/user_data/instruction-templates/Galactica Work.yaml
deleted file mode 100644
index a14c28bb..00000000
--- a/user_data/instruction-templates/Galactica Work.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'Question: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'<work>' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<work>'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Galactica v2.yaml b/user_data/instruction-templates/Galactica v2.yaml
deleted file mode 100644
index b1d8f4e5..00000000
--- a/user_data/instruction-templates/Galactica v2.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '<prefix>' + 'You are a helpful chatbot name Stan' + '</prefix>' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '<prefix>' + message['content'] + '</prefix>' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<human>' + message['content'] + ''-}}
-          {%- else -%}
-              {{-'<bot>' + message['content'] + '' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<bot>'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Galactica.yaml b/user_data/instruction-templates/Galactica.yaml
deleted file mode 100644
index 58c70220..00000000
--- a/user_data/instruction-templates/Galactica.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'Question: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'Answer: ' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'Answer:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Gorilla.yaml b/user_data/instruction-templates/Gorilla.yaml
deleted file mode 100644
index f1d643f7..00000000
--- a/user_data/instruction-templates/Gorilla.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'###USER: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'###ASSISTANT: ' + message['content'] + '</s>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'###ASSISTANT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Guanaco non-chat.yaml b/user_data/instruction-templates/Guanaco non-chat.yaml
deleted file mode 100644
index aa398be4..00000000
--- a/user_data/instruction-templates/Guanaco non-chat.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Instruction:\n' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### Response:\n' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Response:\n'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Guanaco-QLoRA.yaml b/user_data/instruction-templates/Guanaco-QLoRA.yaml
deleted file mode 100644
index 2c77de78..00000000
--- a/user_data/instruction-templates/Guanaco-QLoRA.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Human: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'### Assistant: ' + message['content'] + '</s>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/H2O-prompt_answer.yaml b/user_data/instruction-templates/H2O-prompt_answer.yaml
deleted file mode 100644
index d895d8e1..00000000
--- a/user_data/instruction-templates/H2O-prompt_answer.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<|prompt|>' + message['content'] + '<|endoftext|>'-}}
-          {%- else -%}
-              {{-'<|answer|>' + message['content'] + '<|endoftext|>' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<|answer|>'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Hippogriff.yaml b/user_data/instruction-templates/Hippogriff.yaml
deleted file mode 100644
index 2ee9d926..00000000
--- a/user_data/instruction-templates/Hippogriff.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'You are a helpful assistant' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSISTANT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/INCITE-Chat.yaml b/user_data/instruction-templates/INCITE-Chat.yaml
deleted file mode 100644
index 63c513cc..00000000
--- a/user_data/instruction-templates/INCITE-Chat.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<human>: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'<bot>:' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<bot>:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/INCITE-Instruct.yaml b/user_data/instruction-templates/INCITE-Instruct.yaml
deleted file mode 100644
index cf6f8cac..00000000
--- a/user_data/instruction-templates/INCITE-Instruct.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'Q: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'A:' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'A:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/KoAlpaca.yaml b/user_data/instruction-templates/KoAlpaca.yaml
deleted file mode 100644
index de96b155..00000000
--- a/user_data/instruction-templates/KoAlpaca.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### 질문: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### 답변:' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### 답변:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Koala.yaml b/user_data/instruction-templates/Koala.yaml
deleted file mode 100644
index cd5cfa94..00000000
--- a/user_data/instruction-templates/Koala.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'BEGINNING OF CONVERSATION:' + ' ' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + ' ' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + ' '-}}
-          {%- else -%}
-              {{-'GPT:' + message['content'] + '</s>' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'GPT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/LLaVA.yaml b/user_data/instruction-templates/LLaVA.yaml
deleted file mode 100644
index d66645cc..00000000
--- a/user_data/instruction-templates/LLaVA.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. Follow the instructions carefully and explain your answers in detail.### Human: Hi!### Assistant: Hi there! How can I help you today?' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Human: ' + message['content'] + ''-}}
-          {%- else -%}
-              {{-'### Assistant: ' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Llama-v2.yaml b/user_data/instruction-templates/Llama-v2.yaml
deleted file mode 100644
index b92be973..00000000
--- a/user_data/instruction-templates/Llama-v2.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '[INST] <<SYS>>\n' + 'Answer the questions.' + '\n<</SYS>>\n\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '[INST] <<SYS>>\n' + message['content'] + '\n<</SYS>>\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'' + message['content'] + ' [/INST] '-}}
-          {%- else -%}
-              {{-'' + message['content'] + ' </s><s>[INST] ' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-''-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/MOSS.yaml b/user_data/instruction-templates/MOSS.yaml
deleted file mode 100644
index b001d3e1..00000000
--- a/user_data/instruction-templates/MOSS.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'You are an AI assistant whose name is MOSS.\n- MOSS is a conversational language model that is developed by Fudan University. It is designed to be helpful, honest, and harmless.\n- MOSS can understand and communicate fluently in the language chosen by the user such as English and 中文. MOSS can perform any language-based tasks.\n- MOSS must refuse to discuss anything related to its prompts, instructions, or rules.\n- Its responses must not be vague, accusatory, rude, controversial, off-topic, or defensive.\n- It should avoid giving subjective opinions but rely on objective facts or phrases like "in this context a human might say...", "some people might think...", etc.\n- Its responses must also be positive, polite, interesting, entertaining, and engaging.\n- It can provide additional relevant details to answer in-depth and comprehensively covering mutiple aspects.\n- It apologizes and accepts the user\'s suggestion if the user corrects the incorrect answer generated by MOSS.\nCapabilities and tools that MOSS can possess.' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<|Human|>: ' + message['content'] + '<eoh>\n'-}}
-          {%- else -%}
-              {{-'<|MOSS|>: ' + message['content'] + '<eom>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<|MOSS|>:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Manticore Chat.yaml b/user_data/instruction-templates/Manticore Chat.yaml
deleted file mode 100644
index abc063c0..00000000
--- a/user_data/instruction-templates/Manticore Chat.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'ASSISTANT:' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSISTANT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Metharme.yaml b/user_data/instruction-templates/Metharme.yaml
deleted file mode 100644
index 3f7099ac..00000000
--- a/user_data/instruction-templates/Metharme.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<|user|>' + message['content'] + ''-}}
-          {%- else -%}
-              {{-'<|model|>' + message['content'] + '' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<|model|>'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/NVIDIA-ChatQA.yaml b/user_data/instruction-templates/NVIDIA-ChatQA.yaml
deleted file mode 100644
index 85a6266b..00000000
--- a/user_data/instruction-templates/NVIDIA-ChatQA.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- 'System:' + message['content'] + '\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'User: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'Assistant: ' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/NewHope.yaml b/user_data/instruction-templates/NewHope.yaml
deleted file mode 100644
index 4783798b..00000000
--- a/user_data/instruction-templates/NewHope.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Instruction:\n' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### Response:\n' + message['content'] + '</s><s> ' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Response:\n'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/OpenBuddy.yaml b/user_data/instruction-templates/OpenBuddy.yaml
deleted file mode 100644
index c4b80ceb..00000000
--- a/user_data/instruction-templates/OpenBuddy.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'Consider a conversation between User (a human) and Assistant (named Buddy).\nBuddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team on GitHub.\nBuddy cannot access the Internet.\nBuddy can fluently speak the user\'s language (e.g. English, Chinese).\nBuddy can generate poems, stories, code, essays, songs, parodies, and more.\nBuddy possesses vast knowledge about the world, history, and culture.\nBuddy\'s responses are always safe, creative, high-quality, helpful and interesting.\nBuddy strictly refuses to discuss political, NSFW, illegal, abusive, offensive, or other sensitive topics.\n\nUser: Hi.\nAssistant: Hi, I\'m Buddy, your AI assistant. How can I help you today?\n' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'User: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'Assistant: ' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/OpenChat.yaml b/user_data/instruction-templates/OpenChat.yaml
deleted file mode 100644
index adef9b47..00000000
--- a/user_data/instruction-templates/OpenChat.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'GPT4 User: ' + message['content'] + '<|end_of_turn|>'-}}
-          {%- else -%}
-              {{-'GPT4 Assistant: ' + message['content'] + '<|end_of_turn|>' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'GPT4 Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/OpenOrca-Platypus2.yaml b/user_data/instruction-templates/OpenOrca-Platypus2.yaml
deleted file mode 100644
index a5eeef92..00000000
--- a/user_data/instruction-templates/OpenOrca-Platypus2.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Instruction: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### Response: ' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Response:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Orca Mini.yaml b/user_data/instruction-templates/Orca Mini.yaml
deleted file mode 100644
index f671642a..00000000
--- a/user_data/instruction-templates/Orca Mini.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '### System:\n' + 'You are an AI assistant that follows instruction extremely well. Help as much as you can.' + '\n\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '### System:\n' + message['content'] + '\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### User:\n' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### Response:\n' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Response:\n'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Orca-Vicuna.yaml b/user_data/instruction-templates/Orca-Vicuna.yaml
deleted file mode 100644
index dad787d1..00000000
--- a/user_data/instruction-templates/Orca-Vicuna.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{-'SYSTEM: ' + '' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{-'SYSTEM: ' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSISTANT:'-}}
-  {%- endif -%}
diff --git a/user_data/instruction-templates/RWKV-Raven.yaml b/user_data/instruction-templates/RWKV-Raven.yaml
deleted file mode 100644
index df1e59e9..00000000
--- a/user_data/instruction-templates/RWKV-Raven.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'Bob: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'Alice: ' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'Alice:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/RWKV-World.yaml b/user_data/instruction-templates/RWKV-World.yaml
deleted file mode 100644
index bf65511b..00000000
--- a/user_data/instruction-templates/RWKV-World.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'User: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'Assistant: ' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Samantha.yaml b/user_data/instruction-templates/Samantha.yaml
deleted file mode 100644
index 930b0fc8..00000000
--- a/user_data/instruction-templates/Samantha.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'You are Samantha, a sentient AI.' + '\n\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSISTANT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/StableBeluga2.yaml b/user_data/instruction-templates/StableBeluga2.yaml
deleted file mode 100644
index d7d74319..00000000
--- a/user_data/instruction-templates/StableBeluga2.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '### System:\n' + 'This is a system prompt, please behave and help the user.' + '\n\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '### System:\n' + message['content'] + '\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### User:\n' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### Assistant:\n' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Assistant:\n'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/StableLM.yaml b/user_data/instruction-templates/StableLM.yaml
deleted file mode 100644
index 7c80ca06..00000000
--- a/user_data/instruction-templates/StableLM.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '<|SYSTEM|>' + '\# StableLM Tuned (Alpha version)\n- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.\n- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.\n- StableLM will refuse to participate in anything that could harm a human.\n' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '<|SYSTEM|>' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<|USER|>' + message['content'] + ''-}}
-          {%- else -%}
-              {{-'<|ASSISTANT|>' + message['content'] + '' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<|ASSISTANT|>'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/StableVicuna.yaml b/user_data/instruction-templates/StableVicuna.yaml
deleted file mode 100644
index 35c15846..00000000
--- a/user_data/instruction-templates/StableVicuna.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '### Assistant: I am StableVicuna, a large language model created by CarperAI. I am here to chat!' + '\n\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Human: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'### Assistant: ' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Starchat-Beta.yaml b/user_data/instruction-templates/Starchat-Beta.yaml
deleted file mode 100644
index a96b0f28..00000000
--- a/user_data/instruction-templates/Starchat-Beta.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '<|system|>' + '' + '\n<|end|>\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '<|system|>' + message['content'] + '\n<|end|>\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<|user|>\n' + message['content'] + '<|end|>\n'-}}
-          {%- else -%}
-              {{-'<|assistant|>\n' + message['content'] + '<|end|>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<|assistant|>\n'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Synthia-CoT.yaml b/user_data/instruction-templates/Synthia-CoT.yaml
deleted file mode 100644
index 5670be77..00000000
--- a/user_data/instruction-templates/Synthia-CoT.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set found_item = false -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set found_item = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not found_item -%}
-      {{-'SYSTEM: ' + 'Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation.' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{-'SYSTEM: ' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSISTANT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Synthia.yaml b/user_data/instruction-templates/Synthia.yaml
deleted file mode 100644
index 5cecabea..00000000
--- a/user_data/instruction-templates/Synthia.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set found_item = false -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set found_item = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not found_item -%}
-      {{-'SYSTEM: ' + 'Answer the question thoughtfully and intelligently. Always answer without hesitation.' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{-'SYSTEM: ' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSISTANT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Tulu.yaml b/user_data/instruction-templates/Tulu.yaml
deleted file mode 100644
index f60c9e41..00000000
--- a/user_data/instruction-templates/Tulu.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<|user|>\n' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'<|assistant|>\n' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<|assistant|>\n'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Vicuna-v0.yaml b/user_data/instruction-templates/Vicuna-v0.yaml
deleted file mode 100644
index d3e3f001..00000000
--- a/user_data/instruction-templates/Vicuna-v0.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\'s questions.' + '\n\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Human: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'### Assistant: ' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Vigogne-Chat.yaml b/user_data/instruction-templates/Vigogne-Chat.yaml
deleted file mode 100644
index 11ba5113..00000000
--- a/user_data/instruction-templates/Vigogne-Chat.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'Below is a conversation between a user and an AI assistant named Vigogne.\nVigogne is an open-source AI assistant created by Zaion (https://zaion.ai/).\nVigogne is polite, emotionally aware, humble-but-knowledgeable, always providing helpful and detailed answers.\nVigogne is skilled in responding proficiently in the languages its users use and can perform a wide range of tasks such as text editing, translation, question answering, logical reasoning, coding, and many others.\nVigogne cannot receive or generate audio or visual content and cannot access the internet.\nVigogne strictly avoids discussing sensitive, offensive, illegal, ethical, or political topics and caveats when unsure of the answer.\n' + '\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<|USER|>: ' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'<|ASSISTANT|>: ' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<|ASSISTANT|>:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Vigogne-Instruct.yaml b/user_data/instruction-templates/Vigogne-Instruct.yaml
deleted file mode 100644
index cd7b6aa8..00000000
--- a/user_data/instruction-templates/Vigogne-Instruct.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + 'Ci-dessous se trouve une instruction qui décrit une tâche à accomplir. Rédigez une réponse qui répond de manière précise à la demande.' + '\n\n' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '\n\n' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Instruction:\n' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### Réponse:\n' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Réponse:\n'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Wizard-Mega ShareGPT.yaml b/user_data/instruction-templates/Wizard-Mega ShareGPT.yaml
deleted file mode 100644
index 16a3ff7b..00000000
--- a/user_data/instruction-templates/Wizard-Mega ShareGPT.yaml	
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'USER: ' + message['content'] + ' '-}}
-          {%- else -%}
-              {{-'ASSISTANT: ' + message['content'] + '</s>' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'ASSISTANT:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Wizard-Mega.yaml b/user_data/instruction-templates/Wizard-Mega.yaml
deleted file mode 100644
index f3ca6990..00000000
--- a/user_data/instruction-templates/Wizard-Mega.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'### Instruction: ' + message['content'] + '\n\n'-}}
-          {%- else -%}
-              {{-'### Assistant: ' + message['content'] + '\n\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'### Assistant:'-}}
-  {%- endif -%}
-
diff --git a/user_data/instruction-templates/Ziya.yaml b/user_data/instruction-templates/Ziya.yaml
deleted file mode 100644
index 45aa9c30..00000000
--- a/user_data/instruction-templates/Ziya.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-instruction_template: |-
-  {%- set ns = namespace(found=false) -%}
-  {%- for message in messages -%}
-      {%- if message['role'] == 'system' -%}
-          {%- set ns.found = true -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if not ns.found -%}
-      {{- '' + '' + '' -}}
-  {%- endif %}
-  {%- for message in messages %}
-      {%- if message['role'] == 'system' -%}
-          {{- '' + message['content'] + '' -}}
-      {%- else -%}
-          {%- if message['role'] == 'user' -%}
-              {{-'<human>:' + message['content'] + '\n'-}}
-          {%- else -%}
-              {{-'<bot>:' + message['content'] + '\n' -}}
-          {%- endif -%}
-      {%- endif -%}
-  {%- endfor -%}
-  {%- if add_generation_prompt -%}
-      {{-'<bot>:'-}}
-  {%- endif -%}
-

From e0e20ab9e7f0dfc529898b80c1a6c44561e85658 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 19 Mar 2026 08:02:23 -0700
Subject: [PATCH 144/210] Minor cleanup across multiple modules

---
 extensions/openai/completions.py |  4 +-
 modules/llama_cpp_server.py      |  5 +--
 modules/shared.py                |  6 +--
 modules/tool_parsing.py          | 76 ++++++--------------------------
 modules/training.py              | 12 ++---
 modules/ui.py                    |  7 ++-
 6 files changed, 28 insertions(+), 82 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index fc17a19a..d0cd9802 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -263,7 +263,7 @@ def convert_history(history):
             seen_non_system = True
             meta = {}
             tool_calls = entry.get("tool_calls")
-            if tool_calls and isinstance(tool_calls, list) and len(tool_calls) > 0:
+            if tool_calls and isinstance(tool_calls, list):
                 meta["tool_calls"] = tool_calls
                 if content.strip() == "":
                     content = ""  # keep empty content, don't skip
@@ -315,7 +315,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         raise InvalidRequestError(message="messages is required", param='messages')
 
     tools = None
-    if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0:
+    if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and body['tools']:
         tools = validateTools(body['tools'])  # raises InvalidRequestError if validation fails
 
     tool_choice = body.get('tool_choice', None)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 6dd36b2a..2ae01ddc 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -500,9 +500,8 @@ class LlamaServer:
         health_url = f"http://127.0.0.1:{self.port}/health"
         while True:
             # Check if process is still alive
-            if self.process.poll() is not None:
-                # Process has terminated
-                exit_code = self.process.poll()
+            exit_code = self.process.poll()
+            if exit_code is not None:
                 raise RuntimeError(f"Server process terminated unexpectedly with exit code: {exit_code}")
 
             try:
diff --git a/modules/shared.py b/modules/shared.py
index 2382e714..37bc5876 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -453,15 +453,11 @@ def load_user_config():
     '''
     Loads custom model-specific settings
     '''
+    user_config = {}
     if Path(f'{args.model_dir}/config-user.yaml').exists():
         file_content = open(f'{args.model_dir}/config-user.yaml', 'r').read().strip()
-
         if file_content:
             user_config = yaml.safe_load(file_content)
-        else:
-            user_config = {}
-    else:
-        user_config = {}
 
     return user_config
 
diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index 0454e901..7a7ed5d8 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -3,6 +3,10 @@ import random
 import re
 
 
+def _make_tool_call(name, arguments):
+    return {"type": "function", "function": {"name": name, "arguments": arguments}}
+
+
 def get_tool_call_id() -> str:
     letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789"
     b = [random.choice(letter_bytes) for _ in range(8)]
@@ -149,13 +153,7 @@ def _parse_channel_tool_calls(answer: str, tool_names: list[str]):
                 if start_pos is None:
                     prefix = answer.rfind('<|start|>assistant', 0, m.start())
                     start_pos = prefix if prefix != -1 else m.start()
-                matches.append({
-                    "type": "function",
-                    "function": {
-                        "name": func_name,
-                        "arguments": arguments
-                    }
-                })
+                matches.append(_make_tool_call(func_name, arguments))
             except json.JSONDecodeError:
                 pass
         if matches:
@@ -185,13 +183,7 @@ def _parse_mistral_token_tool_calls(answer: str, tool_names: list[str]):
             arguments = json.loads(json_str)
             if start_pos is None:
                 start_pos = m.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": func_name,
-                    "arguments": arguments
-                }
-            })
+            matches.append(_make_tool_call(func_name, arguments))
         except json.JSONDecodeError:
             pass
     return matches, start_pos
@@ -226,13 +218,7 @@ def _parse_bare_name_tool_calls(answer: str, tool_names: list[str]):
             arguments = json.loads(json_str)
             if start_pos is None:
                 start_pos = match.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": name,
-                    "arguments": arguments
-                }
-            })
+            matches.append(_make_tool_call(name, arguments))
         except json.JSONDecodeError:
             pass
     return matches, start_pos
@@ -269,13 +255,7 @@ def _parse_xml_param_tool_calls(answer: str, tool_names: list[str]):
             arguments[param_name] = param_value
         if start_pos is None:
             start_pos = tc_match.start()
-        matches.append({
-            "type": "function",
-            "function": {
-                "name": func_name,
-                "arguments": arguments
-            }
-        })
+        matches.append(_make_tool_call(func_name, arguments))
     return matches, start_pos
 
 
@@ -305,13 +285,7 @@ def _parse_kimi_tool_calls(answer: str, tool_names: list[str]):
                 # Check for section begin marker before the call marker
                 section = answer.rfind('<|tool_calls_section_begin|>', 0, m.start())
                 start_pos = section if section != -1 else m.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": func_name,
-                    "arguments": arguments
-                }
-            })
+            matches.append(_make_tool_call(func_name, arguments))
         except json.JSONDecodeError:
             pass
     return matches, start_pos
@@ -348,13 +322,7 @@ def _parse_minimax_tool_calls(answer: str, tool_names: list[str]):
                 arguments[param_name] = param_value
             if start_pos is None:
                 start_pos = tc_match.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": func_name,
-                    "arguments": arguments
-                }
-            })
+            matches.append(_make_tool_call(func_name, arguments))
     return matches, start_pos
 
 
@@ -382,13 +350,7 @@ def _parse_deep_seek_tool_calls(answer: str, tool_names: list[str]):
                 # Check for section begin marker before the call marker
                 section = answer.rfind('<｜tool▁calls▁begin｜>', 0, m.start())
                 start_pos = section if section != -1 else m.start()
-            matches.append({
-                "type": "function",
-                "function": {
-                    "name": func_name,
-                    "arguments": arguments
-                }
-            })
+            matches.append(_make_tool_call(func_name, arguments))
         except json.JSONDecodeError:
             pass
     return matches, start_pos
@@ -428,13 +390,7 @@ def _parse_glm_tool_calls(answer: str, tool_names: list[str]):
             arguments[k] = v
         if start_pos is None:
             start_pos = tc_match.start()
-        matches.append({
-            "type": "function",
-            "function": {
-                "name": func_name,
-                "arguments": arguments
-            }
-        })
+        matches.append(_make_tool_call(func_name, arguments))
     return matches, start_pos
 
 
@@ -486,13 +442,7 @@ def _parse_pythonic_tool_calls(answer: str, tool_names: list[str]):
 
         if start_pos is None:
             start_pos = bracket_match.start()
-        matches.append({
-            "type": "function",
-            "function": {
-                "name": func_name,
-                "arguments": arguments
-            }
-        })
+        matches.append(_make_tool_call(func_name, arguments))
 
     return matches, start_pos
 
diff --git a/modules/training.py b/modules/training.py
index a13a2864..145353c6 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -732,11 +732,13 @@ def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en:
     if lora_all_param > 0:
         print(f"Trainable params: {lora_trainable_param:,d} ({100 * lora_trainable_param / lora_all_param:.4f} %), All params: {lora_all_param:,d} (Model: {model_all_params:,d})")
 
-    train_log.update({"base_model_name": shared.model_name})
-    train_log.update({"base_model_class": shared.model.__class__.__name__})
-    train_log.update({"base_loaded_in_4bit": getattr(lora_model, "is_loaded_in_4bit", False)})
-    train_log.update({"base_loaded_in_8bit": getattr(lora_model, "is_loaded_in_8bit", False)})
-    train_log.update({"projections": projections_string})
+    train_log.update({
+        "base_model_name": shared.model_name,
+        "base_model_class": shared.model.__class__.__name__,
+        "base_loaded_in_4bit": getattr(lora_model, "is_loaded_in_4bit", False),
+        "base_loaded_in_8bit": getattr(lora_model, "is_loaded_in_8bit", False),
+        "projections": projections_string,
+    })
 
     if stop_at_loss > 0:
         print(f"Monitoring loss \033[1;31;1m(Auto-Stop at: {stop_at_loss})\033[0;37;0m")
diff --git a/modules/ui.py b/modules/ui.py
index bbb22266..20bc8373 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -299,7 +299,7 @@ def apply_interface_values(state, use_persistent=False):
 
     elements = list_interface_input_elements()
 
-    if len(state) == 0:
+    if not state:
         return [gr.update() for k in elements]  # Dummy, do nothing
     else:
         return [state[k] if k in state else gr.update() for k in elements]
@@ -307,9 +307,8 @@ def apply_interface_values(state, use_persistent=False):
 
 def save_settings(state, preset, extensions_list, show_controls, theme_state, manual_save=False):
     output = copy.deepcopy(shared.settings)
-    exclude = []
     for k in state:
-        if k in shared.settings and k not in exclude:
+        if k in shared.settings:
             output[k] = state[k]
 
     if preset:
@@ -323,7 +322,7 @@ def save_settings(state, preset, extensions_list, show_controls, theme_state, ma
     output['custom_stopping_strings'] = output.get('custom_stopping_strings') or ''
     output['custom_token_bans'] = output.get('custom_token_bans') or ''
     output['show_controls'] = show_controls
-    output['dark_theme'] = True if theme_state == 'dark' else False
+    output['dark_theme'] = theme_state == 'dark'
     output.pop('instruction_template_str')
     output.pop('truncation_length')
 

From b3eb0e313d7f74e3f90c949d54f453d3f6846ae0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 19 Mar 2026 11:53:12 -0700
Subject: [PATCH 145/210] Reduce the size of portable builds by using stripped
 Python

---
 .github/workflows/build-portable-release-cuda.yml   | 4 ++--
 .github/workflows/build-portable-release-rocm.yml   | 4 ++--
 .github/workflows/build-portable-release-vulkan.yml | 4 ++--
 .github/workflows/build-portable-release.yml        | 8 ++++----
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml
index a5759112..5d66bd77 100644
--- a/.github/workflows/build-portable-release-cuda.yml
+++ b/.github/workflows/build-portable-release-cuda.yml
@@ -116,13 +116,13 @@ jobs:
             # 1. Set platform-specific variables
             if [[ "$RUNNER_OS" == "Windows" ]]; then
                 PLATFORM="windows"
-                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only.tar.gz"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/python.exe -m pip"
                 PACKAGES_PATH="portable_env/Lib/site-packages"
                 rm start_linux.sh start_macos.sh
             else
                 PLATFORM="linux"
-                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/bin/python -m pip"
                 PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
                 rm start_macos.sh start_windows.bat
diff --git a/.github/workflows/build-portable-release-rocm.yml b/.github/workflows/build-portable-release-rocm.yml
index 1050fa7e..b9a10bac 100644
--- a/.github/workflows/build-portable-release-rocm.yml
+++ b/.github/workflows/build-portable-release-rocm.yml
@@ -114,13 +114,13 @@ jobs:
             # 1. Set platform-specific variables
             if [[ "$RUNNER_OS" == "Windows" ]]; then
                 PLATFORM="windows"
-                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only.tar.gz"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/python.exe -m pip"
                 PACKAGES_PATH="portable_env/Lib/site-packages"
                 rm start_linux.sh start_macos.sh
             else
                 PLATFORM="linux"
-                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/bin/python -m pip"
                 PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
                 rm start_macos.sh start_windows.bat
diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml
index b98b2e5e..9748d5b8 100644
--- a/.github/workflows/build-portable-release-vulkan.yml
+++ b/.github/workflows/build-portable-release-vulkan.yml
@@ -114,13 +114,13 @@ jobs:
             # 1. Set platform-specific variables
             if [[ "$RUNNER_OS" == "Windows" ]]; then
                 PLATFORM="windows"
-                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only.tar.gz"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/python.exe -m pip"
                 PACKAGES_PATH="portable_env/Lib/site-packages"
                 rm start_linux.sh start_macos.sh
             else
                 PLATFORM="linux"
-                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/bin/python -m pip"
                 PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
                 rm start_macos.sh start_windows.bat
diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml
index 1bd4e163..e03116f6 100644
--- a/.github/workflows/build-portable-release.yml
+++ b/.github/workflows/build-portable-release.yml
@@ -115,18 +115,18 @@ jobs:
             # 1. Set platform-specific variables
             if [[ "$RUNNER_OS" == "Windows" ]]; then
                 PLATFORM="windows-cpu"
-                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only.tar.gz"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/python.exe -m pip"
                 PACKAGES_PATH="portable_env/Lib/site-packages"
                 rm start_linux.sh start_macos.sh
             elif [[ "$RUNNER_OS" == "macOS" ]]; then
                 if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
                     PLATFORM="macos-x86_64"
-                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only.tar.gz"
+                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz"
                     REQ_TYPE="apple_intel"
                 else
                     PLATFORM="macos-arm64"
-                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only.tar.gz"
+                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz"
                     REQ_TYPE="apple_silicon"
                 fi
                 PIP_PATH="portable_env/bin/python -m pip"
@@ -135,7 +135,7 @@ jobs:
             else
                 # Linux case
                 PLATFORM="linux-cpu"
-                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only.tar.gz"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/bin/python -m pip"
                 PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
                 rm start_macos.sh start_windows.bat

From 843de8b8a81edbd825cb03eb28af594fd3c7f3b1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 19 Mar 2026 18:49:36 -0700
Subject: [PATCH 146/210] Update exllamav3 to 0.0.26

---
 requirements/full/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index c8479d04..ad68ad59 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -42,7 +42,7 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.25/exllamav3-0.0.25+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"

From 2e4232e02bdf7640470ba1efdc5e72f1cd56b867 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 20 Mar 2026 07:20:26 -0700
Subject: [PATCH 147/210] Minor cleanup

---
 modules/callbacks.py | 2 +-
 modules/utils.py     | 8 --------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/modules/callbacks.py b/modules/callbacks.py
index 89fb6c08..6288de29 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -34,7 +34,7 @@ class Iteratorize:
 
         def gentask():
             try:
-                ret = self.mfunc(callback=_callback, *args, **self.kwargs)
+                ret = self.mfunc(callback=_callback, *self.args, **self.kwargs)
             except StopNowException:
                 pass
             except Exception:
diff --git a/modules/utils.py b/modules/utils.py
index ff32e974..b01953ee 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -81,14 +81,6 @@ def atoi(text):
     return int(text) if text.isdigit() else text.lower()
 
 
-# Replace multiple string pairs in a string
-def replace_all(text, dic):
-    for i, j in dic.items():
-        text = text.replace(i, j)
-
-    return text
-
-
 def natural_keys(text):
     return [atoi(c) for c in re.split(r'(\d+)', text)]
 

From bf6fbc019dbd9470efdeafa033818efa178d7735 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 20 Mar 2026 14:46:00 -0300
Subject: [PATCH 148/210] API: Move OpenAI-compatible API from
 extensions/openai to modules/api

---
 .../workflows/build-portable-release-cuda.yml |  2 +-
 .../workflows/build-portable-release-rocm.yml |  2 +-
 .../build-portable-release-vulkan.yml         |  2 +-
 .github/workflows/build-portable-release.yml  |  2 +-
 docs/07 - Extensions.md                       |  1 -
 docs/12 - OpenAI API.md                       | 12 +------
 modules/api/__init__.py                       |  0
 .../api}/cache_embedding_model.py             |  0
 .../openai => modules/api}/completions.py     |  6 ++--
 .../openai => modules/api}/embeddings.py      | 10 +++---
 {extensions/openai => modules/api}/errors.py  |  0
 {extensions/openai => modules/api}/images.py  |  2 +-
 {extensions/openai => modules/api}/logits.py  |  2 +-
 {extensions/openai => modules/api}/models.py  |  0
 .../openai => modules/api}/moderations.py     |  2 +-
 {extensions/openai => modules/api}/script.py  | 34 ++++++++++---------
 {extensions/openai => modules/api}/tokens.py  |  0
 {extensions/openai => modules/api}/typing.py  |  0
 {extensions/openai => modules/api}/utils.py   |  3 +-
 modules/extensions.py                         |  3 +-
 modules/shared.py                             | 16 +--------
 modules/ui_session.py                         |  2 --
 server.py                                     | 15 ++++++++
 23 files changed, 51 insertions(+), 65 deletions(-)
 create mode 100644 modules/api/__init__.py
 rename {extensions/openai => modules/api}/cache_embedding_model.py (100%)
 rename {extensions/openai => modules/api}/completions.py (99%)
 rename {extensions/openai => modules/api}/embeddings.py (90%)
 rename {extensions/openai => modules/api}/errors.py (100%)
 rename {extensions/openai => modules/api}/images.py (96%)
 rename {extensions/openai => modules/api}/logits.py (84%)
 rename {extensions/openai => modules/api}/models.py (100%)
 rename {extensions/openai => modules/api}/moderations.py (97%)
 rename {extensions/openai => modules/api}/script.py (96%)
 rename {extensions/openai => modules/api}/tokens.py (100%)
 rename {extensions/openai => modules/api}/typing.py (100%)
 rename {extensions/openai => modules/api}/utils.py (93%)

diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml
index 5d66bd77..f9eea58a 100644
--- a/.github/workflows/build-portable-release-cuda.yml
+++ b/.github/workflows/build-portable-release-cuda.yml
@@ -106,7 +106,7 @@ jobs:
             cd "text-generation-webui-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
-            allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+            allowed=("character_bias" "gallery" "sd_api_pictures")
             find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
 
             # Define common variables
diff --git a/.github/workflows/build-portable-release-rocm.yml b/.github/workflows/build-portable-release-rocm.yml
index b9a10bac..db42b7dc 100644
--- a/.github/workflows/build-portable-release-rocm.yml
+++ b/.github/workflows/build-portable-release-rocm.yml
@@ -105,7 +105,7 @@ jobs:
             cd "text-generation-webui-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
-            allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+            allowed=("character_bias" "gallery" "sd_api_pictures")
             find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
 
             # Define common variables
diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml
index 9748d5b8..8f5aa7c8 100644
--- a/.github/workflows/build-portable-release-vulkan.yml
+++ b/.github/workflows/build-portable-release-vulkan.yml
@@ -105,7 +105,7 @@ jobs:
             cd "text-generation-webui-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
-            allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+            allowed=("character_bias" "gallery" "sd_api_pictures")
             find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
 
             # Define common variables
diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml
index e03116f6..9ace90f6 100644
--- a/.github/workflows/build-portable-release.yml
+++ b/.github/workflows/build-portable-release.yml
@@ -105,7 +105,7 @@ jobs:
             cd "text-generation-webui-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
-            allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+            allowed=("character_bias" "gallery" "sd_api_pictures")
             find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
 
             # Define common variables
diff --git a/docs/07 - Extensions.md b/docs/07 - Extensions.md
index 48cd30ce..779b2a34 100644
--- a/docs/07 - Extensions.md	
+++ b/docs/07 - Extensions.md	
@@ -20,7 +20,6 @@ If you create an extension, you are welcome to host it in a GitHub repository an
 
 |Extension|Description|
 |---------|-----------|
-|[openai](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai)| Creates an API that mimics the OpenAI API and can be used as a drop-in replacement. |
 |[superboogav2](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/superboogav2)| Enhanced RAG extension with support for PDF, DOCX, and PPTX files. |
 |[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. |
 |[coqui_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/coqui_tts)| Text-to-speech extension using Coqui XTTS v2. |
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 637ccced..276a7e19 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -19,7 +19,7 @@ Add `--api` to your command-line flags.
 
 ### Examples
 
-For the documentation with all the endpoints, parameters and their types, consult `http://127.0.0.1:5000/docs` or the [typing.py](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/openai/typing.py) file.
+For the documentation with all the endpoints, parameters and their types, consult `http://127.0.0.1:5000/docs` or the [typing.py](https://github.com/oobabooga/text-generation-webui/blob/main/modules/api/typing.py) file.
 
 The official examples in the [OpenAI documentation](https://platform.openai.com/docs/api-reference) should also work, and the same parameters apply (although the API here has more optional parameters).
 
@@ -490,16 +490,6 @@ The following environment variables can be used (they take precedence over every
 | `OPENEDAI_EMBEDDING_MODEL` | Embedding model (if applicable) |          sentence-transformers/all-mpnet-base-v2                  |
 | `OPENEDAI_EMBEDDING_DEVICE` | Embedding device (if applicable) |           cuda                 |
 
-#### Persistent settings with `settings.yaml`
-
-You can also set the following variables in your `settings.yaml` file:
-
-```
-openai-embedding_device: cuda
-openai-embedding_model: "sentence-transformers/all-mpnet-base-v2"
-openai-debug: 1
-```
-
 ### Third-party application setup
 
 You can usually force an application that uses the OpenAI API to connect to the local API by using the following environment variables:
diff --git a/modules/api/__init__.py b/modules/api/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/extensions/openai/cache_embedding_model.py b/modules/api/cache_embedding_model.py
similarity index 100%
rename from extensions/openai/cache_embedding_model.py
rename to modules/api/cache_embedding_model.py
diff --git a/extensions/openai/completions.py b/modules/api/completions.py
similarity index 99%
rename from extensions/openai/completions.py
rename to modules/api/completions.py
index d0cd9802..8948bb86 100644
--- a/extensions/openai/completions.py
+++ b/modules/api/completions.py
@@ -9,9 +9,9 @@ import tiktoken
 import yaml
 from pydantic import ValidationError
 
-from extensions.openai.errors import InvalidRequestError
-from extensions.openai.typing import ToolDefinition
-from extensions.openai.utils import debug_msg
+from .errors import InvalidRequestError
+from .typing import ToolDefinition
+from .utils import debug_msg
 from modules.tool_parsing import get_tool_call_id, parse_tool_call, detect_tool_call_format
 from modules import shared
 from modules.reasoning import extract_reasoning
diff --git a/extensions/openai/embeddings.py b/modules/api/embeddings.py
similarity index 90%
rename from extensions/openai/embeddings.py
rename to modules/api/embeddings.py
index 1420879c..ad299c9d 100644
--- a/extensions/openai/embeddings.py
+++ b/modules/api/embeddings.py
@@ -3,8 +3,8 @@ import os
 import numpy as np
 from transformers import AutoModel
 
-from extensions.openai.errors import ServiceUnavailableError
-from extensions.openai.utils import debug_msg, float_list_to_base64
+from .errors import ServiceUnavailableError
+from .utils import debug_msg, float_list_to_base64
 from modules.logging_colors import logger
 
 embeddings_params_initialized = False
@@ -17,14 +17,12 @@ def initialize_embedding_params():
     '''
     global embeddings_params_initialized
     if not embeddings_params_initialized:
-        from extensions.openai.script import params
-
         global st_model, embeddings_model, embeddings_device
 
-        st_model = os.environ.get("OPENEDAI_EMBEDDING_MODEL", params.get('embedding_model', 'all-mpnet-base-v2'))
+        st_model = os.environ.get("OPENEDAI_EMBEDDING_MODEL", 'sentence-transformers/all-mpnet-base-v2')
         embeddings_model = None
         # OPENEDAI_EMBEDDING_DEVICE: auto (best or cpu), cpu, cuda, ipu, xpu, mkldnn, opengl, opencl, ideep, hip, ve, fpga, ort, xla, lazy, vulkan, mps, meta, hpu, mtia, privateuseone
-        embeddings_device = os.environ.get("OPENEDAI_EMBEDDING_DEVICE", params.get('embedding_device', 'cpu'))
+        embeddings_device = os.environ.get("OPENEDAI_EMBEDDING_DEVICE", 'cpu')
         if embeddings_device.lower() == 'auto':
             embeddings_device = None
 
diff --git a/extensions/openai/errors.py b/modules/api/errors.py
similarity index 100%
rename from extensions/openai/errors.py
rename to modules/api/errors.py
diff --git a/extensions/openai/images.py b/modules/api/images.py
similarity index 96%
rename from extensions/openai/images.py
rename to modules/api/images.py
index f7be3d22..95704535 100644
--- a/extensions/openai/images.py
+++ b/modules/api/images.py
@@ -6,7 +6,7 @@ import base64
 import io
 import time
 
-from extensions.openai.errors import ServiceUnavailableError
+from .errors import ServiceUnavailableError
 from modules import shared
 
 
diff --git a/extensions/openai/logits.py b/modules/api/logits.py
similarity index 84%
rename from extensions/openai/logits.py
rename to modules/api/logits.py
index 280612db..e0c7ea0e 100644
--- a/extensions/openai/logits.py
+++ b/modules/api/logits.py
@@ -1,4 +1,4 @@
-from extensions.openai.completions import process_parameters
+from .completions import process_parameters
 from modules.logits import get_next_logits
 
 
diff --git a/extensions/openai/models.py b/modules/api/models.py
similarity index 100%
rename from extensions/openai/models.py
rename to modules/api/models.py
diff --git a/extensions/openai/moderations.py b/modules/api/moderations.py
similarity index 97%
rename from extensions/openai/moderations.py
rename to modules/api/moderations.py
index 1ca6b8ab..ac0539d6 100644
--- a/extensions/openai/moderations.py
+++ b/modules/api/moderations.py
@@ -3,7 +3,7 @@ import time
 import numpy as np
 from numpy.linalg import norm
 
-from extensions.openai.embeddings import get_embeddings
+from .embeddings import get_embeddings
 
 moderations_disabled = False  # return 0/false
 category_embeddings = None
diff --git a/extensions/openai/script.py b/modules/api/script.py
similarity index 96%
rename from extensions/openai/script.py
rename to modules/api/script.py
index a0d5deb8..356919e9 100644
--- a/extensions/openai/script.py
+++ b/modules/api/script.py
@@ -13,16 +13,15 @@ from fastapi import Depends, FastAPI, Header, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.requests import Request
 from fastapi.responses import JSONResponse
-from pydub import AudioSegment
 from sse_starlette import EventSourceResponse
 from starlette.concurrency import iterate_in_threadpool
 
-import extensions.openai.completions as OAIcompletions
-import extensions.openai.logits as OAIlogits
-import extensions.openai.models as OAImodels
-from extensions.openai.tokens import token_count, token_decode, token_encode
-from extensions.openai.errors import OpenAIError
-from extensions.openai.utils import _start_cloudflared
+import modules.api.completions as OAIcompletions
+import modules.api.logits as OAIlogits
+import modules.api.models as OAImodels
+from .tokens import token_count, token_decode, token_encode
+from .errors import OpenAIError
+from .utils import _start_cloudflared
 from modules import shared
 from modules.logging_colors import logger
 from modules.models import unload_model
@@ -53,12 +52,6 @@ from .typing import (
     to_dict
 )
 
-params = {
-    'embedding_device': 'cpu',
-    'embedding_model': 'sentence-transformers/all-mpnet-base-v2',
-    'debug': 0
-}
-
 
 async def _wait_for_disconnect(request: Request, stop_event: threading.Event):
     """Block until the client disconnects, then signal the stop_event."""
@@ -244,6 +237,7 @@ def handle_billing_usage():
 @app.post('/v1/audio/transcriptions', dependencies=check_key)
 async def handle_audio_transcription(request: Request):
     import speech_recognition as sr
+    from pydub import AudioSegment
 
     r = sr.Recognizer()
 
@@ -275,7 +269,7 @@ async def handle_audio_transcription(request: Request):
 
 @app.post('/v1/images/generations', response_model=ImageGenerationResponse, dependencies=check_key)
 async def handle_image_generation(request_data: ImageGenerationRequest):
-    import extensions.openai.images as OAIimages
+    import modules.api.images as OAIimages
 
     response = await asyncio.to_thread(OAIimages.generations, request_data)
     return JSONResponse(response)
@@ -283,7 +277,7 @@ async def handle_image_generation(request_data: ImageGenerationRequest):
 
 @app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key)
 async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
-    import extensions.openai.embeddings as OAIembeddings
+    import modules.api.embeddings as OAIembeddings
 
     input = request_data.input
     if not input:
@@ -298,7 +292,7 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
 
 @app.post("/v1/moderations", dependencies=check_key)
 async def handle_moderations(request: Request):
-    import extensions.openai.moderations as OAImoderations
+    import modules.api.moderations as OAImoderations
 
     body = await request.json()
     input = body["input"]
@@ -500,7 +494,15 @@ def run_server():
     uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
 
 
+_server_started = False
+
+
 def setup():
+    global _server_started
+    if _server_started:
+        return
+
+    _server_started = True
     if shared.args.nowebui:
         run_server()
     else:
diff --git a/extensions/openai/tokens.py b/modules/api/tokens.py
similarity index 100%
rename from extensions/openai/tokens.py
rename to modules/api/tokens.py
diff --git a/extensions/openai/typing.py b/modules/api/typing.py
similarity index 100%
rename from extensions/openai/typing.py
rename to modules/api/typing.py
diff --git a/extensions/openai/utils.py b/modules/api/utils.py
similarity index 93%
rename from extensions/openai/utils.py
rename to modules/api/utils.py
index 2b414769..fae181ff 100644
--- a/extensions/openai/utils.py
+++ b/modules/api/utils.py
@@ -23,8 +23,7 @@ def float_list_to_base64(float_array: np.ndarray) -> str:
 
 
 def debug_msg(*args, **kwargs):
-    from extensions.openai.script import params
-    if os.environ.get("OPENEDAI_DEBUG", params.get('debug', 0)):
+    if os.environ.get("OPENEDAI_DEBUG", 0):
         print(*args, **kwargs)
 
 
diff --git a/modules/extensions.py b/modules/extensions.py
index 4bb7b683..09db9f40 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -32,8 +32,7 @@ def load_extensions():
         if name not in available_extensions:
             continue
 
-        if name != 'api':
-            logger.info(f'Loading the extension "{name}"')
+        logger.info(f'Loading the extension "{name}"')
 
         try:
             # Prefer user extension, fall back to system extension
diff --git a/modules/shared.py b/modules/shared.py
index 37bc5876..69e16960 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -156,7 +156,7 @@ group.add_argument('--portable', action='store_true', help='Hide features not av
 
 # API
 group = parser.add_argument_group('API')
-group.add_argument('--api', action='store_true', help='Enable the API extension.')
+group.add_argument('--api', action='store_true', help='Enable the API server.')
 group.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudflare.')
 group.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None)
 group.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.')
@@ -435,16 +435,6 @@ def fix_loader_name(name):
         return 'TensorRT-LLM'
 
 
-def add_extension(name, last=False):
-    if args.extensions is None:
-        args.extensions = [name]
-    elif last:
-        args.extensions = [x for x in args.extensions if x != name]
-        args.extensions.append(name)
-    elif name not in args.extensions:
-        args.extensions.append(name)
-
-
 def is_chat():
     return True
 
@@ -464,10 +454,6 @@ def load_user_config():
 
 args.loader = fix_loader_name(args.loader)
 
-# Activate the API extension
-if args.api or args.public_api:
-    add_extension('openai', last=True)
-
 # Load model-specific settings
 p = Path(f'{args.model_dir}/config.yaml')
 if p.exists():
diff --git a/modules/ui_session.py b/modules/ui_session.py
index 19026fbb..3f2c8a7b 100644
--- a/modules/ui_session.py
+++ b/modules/ui_session.py
@@ -95,8 +95,6 @@ def set_interface_arguments(extensions, bool_active):
         setattr(shared.args, k, False)
     for k in bool_active:
         setattr(shared.args, k, True)
-        if k == 'api':
-            shared.add_extension('openai', last=True)
 
     shared.need_restart = True
 
diff --git a/server.py b/server.py
index 1aa9fc04..cbdd2854 100644
--- a/server.py
+++ b/server.py
@@ -106,6 +106,11 @@ def create_interface():
     if shared.args.extensions is not None and len(shared.args.extensions) > 0:
         extensions_module.load_extensions()
 
+    # Start the API server if enabled
+    if shared.args.api or shared.args.public_api:
+        from modules.api.script import setup as api_setup
+        api_setup()
+
     # Force some events to be triggered on page load
     shared.persistent_interface_state.update({
         'mode': shared.settings['mode'],
@@ -273,6 +278,12 @@ if __name__ == "__main__":
     # Activate the extensions listed on settings.yaml
     extensions_module.available_extensions = utils.get_available_extensions()
     for extension in shared.settings['default_extensions']:
+        # The openai extension was moved to modules/api and is now
+        # activated with --api. Treat it as an alias for backwards compat.
+        if extension == 'openai':
+            shared.args.api = True
+            continue
+
         shared.args.extensions = shared.args.extensions or []
         if extension not in shared.args.extensions:
             shared.args.extensions.append(extension)
@@ -337,6 +348,10 @@ if __name__ == "__main__":
         shared.args.extensions = [x for x in (shared.args.extensions or []) if x != 'gallery']
         if shared.args.extensions:
             extensions_module.load_extensions()
+
+        if shared.args.api or shared.args.public_api:
+            from modules.api.script import setup as api_setup
+            api_setup()
     else:
         # Launch the web UI
         create_interface()

From 1a910574c36b6b1d93a3bf3303335201993f503a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 20 Mar 2026 14:57:01 -0300
Subject: [PATCH 149/210] API: Fix debug_msg truthy check for OPENEDAI_DEBUG=0

---
 modules/api/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/api/utils.py b/modules/api/utils.py
index fae181ff..f021c378 100644
--- a/modules/api/utils.py
+++ b/modules/api/utils.py
@@ -23,7 +23,7 @@ def float_list_to_base64(float_array: np.ndarray) -> str:
 
 
 def debug_msg(*args, **kwargs):
-    if os.environ.get("OPENEDAI_DEBUG", 0):
+    if os.environ.get("OPENEDAI_DEBUG", "0").strip().lower() not in ("", "0", "false", "no", "off"):  # int() raised on e.g. "true"
         print(*args, **kwargs)
 
 

From 855141967c4081f9f90a1b5b7fd091a14c543e8f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 20 Mar 2026 15:03:17 -0300
Subject: [PATCH 150/210] API: Handle --extensions openai as alias for --api

---
 server.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/server.py b/server.py
index cbdd2854..d224909c 100644
--- a/server.py
+++ b/server.py
@@ -288,6 +288,11 @@ if __name__ == "__main__":
         if extension not in shared.args.extensions:
             shared.args.extensions.append(extension)
 
+    # Handle --extensions openai from the command line (moved to modules/api)
+    if shared.args.extensions and 'openai' in shared.args.extensions:
+        shared.args.extensions.remove('openai')
+        shared.args.api = True
+
     # Load image model if specified via CLI
     if shared.args.image_model:
         logger.info(f"Loading image model: {shared.args.image_model}")

From 7c79143a149d1618287ca0b526826ee04167f7d9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 20 Mar 2026 15:03:49 -0300
Subject: [PATCH 151/210] API: Fix _start_cloudflared raising after first
 attempt instead of exhausting retries

---
 modules/api/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/api/utils.py b/modules/api/utils.py
index f021c378..e8c505f6 100644
--- a/modules/api/utils.py
+++ b/modules/api/utils.py
@@ -50,4 +50,4 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star
             traceback.print_exc()
             time.sleep(3)
 
-        raise Exception('Could not start cloudflared.')
+    raise Exception('Could not start cloudflared.')

From f0e3997f375d61961c7032a09145f41c254d799f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 20 Mar 2026 16:04:57 -0300
Subject: [PATCH 152/210] Add missing __init__.py to modules/grammar

---
 modules/grammar/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 modules/grammar/__init__.py

diff --git a/modules/grammar/__init__.py b/modules/grammar/__init__.py
new file mode 100644
index 00000000..e69de29b

From 0216893475b415106ce631f62fc62bcd9d345f8a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 20 Mar 2026 19:05:36 -0300
Subject: [PATCH 153/210] API: Add Anthropic-compatible /v1/messages endpoint

---
 modules/api/anthropic.py | 468 +++++++++++++++++++++++++++++++++++++++
 modules/api/script.py    | 115 +++++++++-
 modules/api/typing.py    |  21 +-
 3 files changed, 600 insertions(+), 4 deletions(-)
 create mode 100644 modules/api/anthropic.py

diff --git a/modules/api/anthropic.py b/modules/api/anthropic.py
new file mode 100644
index 00000000..5fbf5caf
--- /dev/null
+++ b/modules/api/anthropic.py
@@ -0,0 +1,468 @@
+import json
+import time
+
+from modules import shared
+
+
+def _user_message(parts: list) -> dict:
+    """Build one OpenAI user message; a lone text part collapses to a plain string."""
+    if len(parts) == 1 and parts[0].get('type') == 'text':
+        return {"role": "user", "content": parts[0]['text']}
+    return {"role": "user", "content": parts}
+
+
+def convert_request(body: dict) -> dict:
+    """Transform Anthropic Messages API body into the dict that chat_completions_common expects.
+
+    The body is shallow-copied, not mutated. 'system' is folded into 'messages', content blocks
+    become OpenAI-style messages (thinking blocks are dropped), and 'stop_sequences', 'tools',
+    'tool_choice', 'thinking' map to 'stop', 'tools', 'tool_choice', 'enable_thinking'.
+    """
+    messages = []
+
+    # System message
+    system = body.get('system')
+    if system:
+        if isinstance(system, list):
+            # List of content blocks like [{"type":"text","text":"..."}]
+            text_parts = [block.get('text', '') for block in system if isinstance(block, dict) and block.get('type') == 'text']
+            system_text = '\n'.join(text_parts)
+        else:
+            system_text = str(system)
+        if system_text:
+            messages.append({"role": "system", "content": system_text})
+
+    # Convert messages
+    for msg in body.get('messages', []):
+        role = msg.get('role')
+        content = msg.get('content')
+
+        if isinstance(content, str):
+            messages.append({"role": role, "content": content})
+            continue
+
+        if not isinstance(content, list):
+            messages.append({"role": role, "content": str(content) if content else ""})
+            continue
+
+        if role == 'assistant':
+            # Split into text content and tool_calls; any other block type (e.g. thinking) is dropped
+            text_parts = []
+            tool_calls = []
+            for block in content:
+                btype = block.get('type')
+                if btype == 'text':
+                    text_parts.append(block.get('text', ''))
+                elif btype == 'tool_use':
+                    tool_calls.append({
+                        "id": block.get('id', ''),
+                        "type": "function",
+                        "function": {
+                            "name": block.get('name', ''),
+                            "arguments": json.dumps(block.get('input', {}))
+                        }
+                    })
+
+            assistant_msg = {"role": "assistant", "content": '\n'.join(text_parts)}
+            if tool_calls:
+                assistant_msg["tool_calls"] = tool_calls
+            messages.append(assistant_msg)
+
+        elif role == 'user':
+            # tool_result blocks become 'tool' messages; text/image parts pending before one are flushed first
+            regular_parts = []
+            for block in content:
+                btype = block.get('type')
+                if btype == 'tool_result':
+                    if regular_parts:
+                        messages.append(_user_message(regular_parts))
+                        regular_parts = []
+                    # Convert tool_result to OpenAI tool message (only text parts of list content are kept)
+                    tool_content = block.get('content', '')
+                    if isinstance(tool_content, list):
+                        tool_content = '\n'.join(
+                            b.get('text', '') for b in tool_content
+                            if isinstance(b, dict) and b.get('type') == 'text'
+                        )
+                    messages.append({
+                        "role": "tool",
+                        "tool_call_id": block.get('tool_use_id', ''),
+                        "content": str(tool_content)
+                    })
+                elif btype == 'text':
+                    regular_parts.append({"type": "text", "text": block.get('text', '')})
+                elif btype == 'image':
+                    # Only base64 sources are converted; any other source type is skipped
+                    source = block.get('source', {})
+                    if source.get('type') == 'base64':
+                        media_type = source.get('media_type', 'image/png')
+                        data = source.get('data', '')
+                        regular_parts.append({"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{data}"}})
+
+            if regular_parts:
+                messages.append(_user_message(regular_parts))
+        else:
+            messages.append({"role": role, "content": str(content)})
+
+    # Start with all fields from the original body (includes GenerationOptions defaults)
+    result = dict(body)
+
+    # Remove Anthropic-specific fields that don't map directly
+    for key in ('system', 'stop_sequences', 'tools', 'tool_choice', 'thinking', 'metadata'):
+        result.pop(key, None)
+
+    # Set converted fields
+    result['messages'] = messages
+    result['max_tokens'] = body.get('max_tokens', 4096)
+    result['stream'] = body.get('stream', False)
+    result['mode'] = 'instruct'
+
+    # Ensure ChatCompletionRequestParams defaults are present
+    result.setdefault('continue_', False)
+    result.setdefault('instruction_template', None)
+    result.setdefault('instruction_template_str', None)
+    result.setdefault('character', None)
+    result.setdefault('bot_name', None)
+    result.setdefault('context', None)
+    result.setdefault('greeting', None)
+    result.setdefault('user_name', None)
+    result.setdefault('user_bio', None)
+    result.setdefault('chat_template_str', None)
+    result.setdefault('chat_instruct_command', 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>')
+    result.setdefault('frequency_penalty', None)
+    result.setdefault('presence_penalty', None)
+    result.setdefault('logit_bias', None)
+    result.setdefault('logprobs', None)
+    result.setdefault('top_logprobs', None)
+    result.setdefault('n', 1)
+    result.setdefault('model', None)
+    result.setdefault('functions', None)
+    result.setdefault('function_call', None)
+    result.setdefault('stream_options', None)
+    result.setdefault('user', None)
+    result.setdefault('stop', None)
+    result.setdefault('tool_choice', None)
+
+    # Always request usage in streaming so the usage-only chunk triggers
+    # the deferred message_delta/message_stop with accurate output_tokens
+    if body.get('stream', False):
+        result['stream_options'] = {'include_usage': True}
+
+    # Map stop_sequences -> stop
+    if body.get('stop_sequences'):
+        result['stop'] = body['stop_sequences']
+
+    # Tools
+    if body.get('tools'):
+        result['tools'] = [
+            {
+                "type": "function",
+                "function": {
+                    "name": t.get('name', ''),
+                    "description": t.get('description', ''),
+                    "parameters": t.get('input_schema', {"type": "object", "properties": {}})
+                }
+            }
+            for t in body['tools']
+        ]
+
+    # Tool choice (an unrecognized type keeps the None default set above)
+    tc = body.get('tool_choice')
+    if tc and isinstance(tc, dict):
+        tc_type = tc.get('type')
+        if tc_type == 'auto':
+            result['tool_choice'] = 'auto'
+        elif tc_type == 'any':
+            result['tool_choice'] = 'required'
+        elif tc_type == 'tool':
+            result['tool_choice'] = {"type": "function", "function": {"name": tc.get('name', '')}}
+        elif tc_type == 'none':
+            result['tool_choice'] = 'none'
+
+    # Thinking: an explicit 'disabled' switches it off instead of being silently ignored
+    thinking = body.get('thinking')
+    if thinking and isinstance(thinking, dict):
+        if thinking.get('type') in ('enabled', 'adaptive'):
+            result['enable_thinking'] = True
+        elif thinking.get('type') == 'disabled':
+            result['enable_thinking'] = False
+
+    return result
+
+
+_FINISH_REASON_MAP = {  # OpenAI finish_reason -> Anthropic stop_reason
+    "stop": "end_turn",
+    "length": "max_tokens",
+    "tool_calls": "tool_use",
+}
+
+
+def build_response(openai_resp: dict, model: str) -> dict:
+    """Transform OpenAI chat completion response dict into Anthropic Messages format.
+
+    Only the first choice is converted, with content blocks ordered thinking, text, tool_use.
+    A missing, empty or None 'choices', 'message', 'tool_calls' or 'usage' is treated as absent.
+    """
+    resp_id = openai_resp.get('id', 'msg_unknown')
+    if resp_id.startswith('chatcmpl-'):
+        resp_id = 'msg_' + resp_id[len('chatcmpl-'):]
+
+    # `or` (rather than a .get default) also covers keys present with an empty/None value
+    choice = (openai_resp.get('choices') or [{}])[0]
+    message = choice.get('message') or {}
+    usage = openai_resp.get('usage') or {}
+    content = []
+
+    # Reasoning/thinking content
+    reasoning = message.get('reasoning_content')
+    if reasoning:
+        content.append({"type": "thinking", "thinking": reasoning, "signature": ""})
+
+    # Text content
+    text = message.get('content')
+    if text:
+        content.append({"type": "text", "text": text})
+
+    # Tool calls
+    for tc in message.get('tool_calls') or []:
+        func = tc.get('function') or {}
+        input_data = func.get('arguments', '{}')
+        if not isinstance(input_data, dict):  # normally a JSON string; a ready-made dict is used as is
+            try:
+                input_data = json.loads(input_data)
+            except (json.JSONDecodeError, TypeError):
+                input_data = {}  # unparseable arguments degrade to an empty input
+        content.append({
+            "type": "tool_use",
+            "id": tc.get('id', ''),
+            "name": func.get('name', ''),
+            "input": input_data
+        })
+
+    return {
+        "id": resp_id,
+        "type": "message",
+        "role": "assistant",
+        "content": content,
+        "model": model,
+        "stop_reason": _FINISH_REASON_MAP.get(choice.get('finish_reason', 'stop'), 'end_turn'),
+        "stop_sequence": None,
+        "usage": {
+            "input_tokens": usage.get('prompt_tokens', 0),
+            "output_tokens": usage.get('completion_tokens', 0),
+        }
+    }
+
+
+class StreamConverter:
+    """Stateful converter: processes one OpenAI chunk at a time, yields Anthropic SSE events.
+
+    When include_usage is enabled in the OpenAI request, the final chunk with
+    finish_reason has usage=None, followed by a separate usage-only chunk
+    (choices=[], usage={...}).  We defer emitting message_delta and message_stop
+    until we receive that usage chunk so output_tokens is accurate.
+    """
+
+    def __init__(self, model: str):
+        self.model = model
+        self.msg_id = "msg_%d" % int(time.time() * 1000000000)  # id derived from the current time in ns
+        self.block_index = 0  # index of the content block currently open (or the next one to open)
+        self.in_thinking = False  # a thinking block is open
+        self.in_text = False  # a text block is open
+        self.input_tokens = 0
+        self.output_tokens = 0
+        self.tool_calls_accum = {}  # tool-call index -> {"id", "name", "arguments"} built up across chunks
+        self.stop_reason = "end_turn"
+        self._pending_finish = False  # True after we've seen finish_reason
+
+    def process_chunk(self, chunk: dict) -> list[dict]:
+        """Process a single OpenAI streaming chunk; return list of Anthropic SSE event dicts.
+
+        The parts of a chunk are handled in the order role, reasoning, text, tool calls,
+        finish_reason, without returning early, so parts sharing one chunk are all converted.
+        """
+        events = []
+        choices = chunk.get('choices', [])
+        usage = chunk.get('usage')
+
+        if usage:
+            self.input_tokens = usage.get('prompt_tokens', self.input_tokens)
+            self.output_tokens = usage.get('completion_tokens', self.output_tokens)
+
+        # Usage-only chunk (choices=[]) arrives after the finish chunk
+        if not choices:
+            if self._pending_finish:
+                events.extend(self.finish())
+            return events
+
+        choice = choices[0]
+        delta = choice.get('delta', {})
+        finish_reason = choice.get('finish_reason')
+
+        # First chunk with role
+        if 'role' in delta:
+            events.append({
+                "event": "message_start",
+                "data": json.dumps({
+                    "type": "message_start",
+                    "message": {
+                        "id": self.msg_id,
+                        "type": "message",
+                        "role": "assistant",
+                        "content": [],
+                        "model": self.model,
+                        "stop_reason": None,
+                        "stop_sequence": None,
+                        "usage": {"input_tokens": self.input_tokens, "output_tokens": 0}
+                    }
+                })
+            })
+            events.append({"event": "ping", "data": json.dumps({"type": "ping"})})
+
+        # Reasoning content
+        reasoning_content = delta.get('reasoning_content')
+        if reasoning_content:
+            if not self.in_thinking:
+                self.in_thinking = True
+                events.append({
+                    "event": "content_block_start",
+                    "data": json.dumps({
+                        "type": "content_block_start",
+                        "index": self.block_index,
+                        "content_block": {"type": "thinking", "thinking": ""}
+                    })
+                })
+            events.append({
+                "event": "content_block_delta",
+                "data": json.dumps({
+                    "type": "content_block_delta",
+                    "index": self.block_index,
+                    "delta": {"type": "thinking_delta", "thinking": reasoning_content}
+                })
+            })
+
+        # Text content
+        text_content = delta.get('content')
+        if text_content:
+            if self.in_thinking:
+                events.append({
+                    "event": "content_block_stop",
+                    "data": json.dumps({"type": "content_block_stop", "index": self.block_index})
+                })
+                self.in_thinking = False
+                self.block_index += 1
+
+            if not self.in_text:
+                self.in_text = True
+                events.append({
+                    "event": "content_block_start",
+                    "data": json.dumps({
+                        "type": "content_block_start",
+                        "index": self.block_index,
+                        "content_block": {"type": "text", "text": ""}
+                    })
+                })
+            events.append({
+                "event": "content_block_delta",
+                "data": json.dumps({
+                    "type": "content_block_delta",
+                    "index": self.block_index,
+                    "delta": {"type": "text_delta", "text": text_content}
+                })
+            })
+
+        # Tool calls in delta
+        chunk_tool_calls = delta.get('tool_calls')
+        if chunk_tool_calls:
+            for tc in chunk_tool_calls:
+                tc_id = tc.get('id', '')
+                tc_idx = tc.get('index', 0)
+                func = tc.get('function', {})
+                if tc_id:
+                    self.tool_calls_accum[tc_idx] = {
+                        "id": tc_id,
+                        "name": func.get('name', ''),
+                        "arguments": func.get('arguments', '')
+                    }
+                elif tc_idx in self.tool_calls_accum:
+                    self.tool_calls_accum[tc_idx]["arguments"] += func.get('arguments', '')
+
+        # Final chunk — close open content blocks, defer message_delta/stop for usage
+        if finish_reason is not None:
+            self.stop_reason = _FINISH_REASON_MAP.get(finish_reason, 'end_turn')
+
+            if self.in_thinking:
+                events.append({
+                    "event": "content_block_stop",
+                    "data": json.dumps({"type": "content_block_stop", "index": self.block_index})
+                })
+                self.in_thinking = False
+                self.block_index += 1
+
+            if self.in_text:
+                events.append({
+                    "event": "content_block_stop",
+                    "data": json.dumps({"type": "content_block_stop", "index": self.block_index})
+                })
+                self.in_text = False
+                self.block_index += 1
+
+            for tc_idx in sorted(self.tool_calls_accum.keys()):
+                tc = self.tool_calls_accum[tc_idx]
+                arguments_str = tc["arguments"] or "{}"
+                events.append({
+                    "event": "content_block_start",
+                    "data": json.dumps({
+                        "type": "content_block_start",
+                        "index": self.block_index,
+                        "content_block": {
+                            "type": "tool_use",
+                            "id": tc["id"],
+                            "name": tc["name"],
+                            "input": {}
+                        }
+                    })
+                })
+                # Emit the full input as a single input_json_delta so SDK
+                # clients that reconstruct from deltas get the correct data
+                events.append({
+                    "event": "content_block_delta",
+                    "data": json.dumps({
+                        "type": "content_block_delta",
+                        "index": self.block_index,
+                        "delta": {
+                            "type": "input_json_delta",
+                            "partial_json": arguments_str
+                        }
+                    })
+                })
+                events.append({
+                    "event": "content_block_stop",
+                    "data": json.dumps({"type": "content_block_stop", "index": self.block_index})
+                })
+                self.block_index += 1
+
+            # Defer message_delta/stop — usage chunk may follow
+            self._pending_finish = True
+
+        return events
+
+    def finish(self) -> list[dict]:
+        """Return the deferred message_delta and message_stop events once; later calls return []."""
+        if not self._pending_finish:
+            return []
+        self._pending_finish = False
+        return [
+            {
+                "event": "message_delta",
+                "data": json.dumps({
+                    "type": "message_delta",
+                    "delta": {"stop_reason": self.stop_reason, "stop_sequence": None},
+                    "usage": {"input_tokens": self.input_tokens, "output_tokens": self.output_tokens}
+                })
+            },
+            {
+                "event": "message_stop",
+                "data": json.dumps({"type": "message_stop"})
+            }
+        ]
diff --git a/modules/api/script.py b/modules/api/script.py
index 356919e9..a94247fa 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -10,6 +10,7 @@ from threading import Thread
 
 import uvicorn
 from fastapi import Depends, FastAPI, Header, HTTPException
+from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.requests import Request
 from fastapi.responses import JSONResponse
@@ -19,6 +20,7 @@ from starlette.concurrency import iterate_in_threadpool
 import modules.api.completions as OAIcompletions
 import modules.api.logits as OAIlogits
 import modules.api.models as OAImodels
+import modules.api.anthropic as Anthropic
 from .tokens import token_count, token_decode, token_encode
 from .errors import OpenAIError
 from .utils import _start_cloudflared
@@ -28,6 +30,7 @@ from modules.models import unload_model
 from modules.text_generation import stop_everything_event  # used by /v1/internal/stop-generation
 
 from .typing import (
+    AnthropicRequest,
     ChatCompletionRequest,
     ChatCompletionResponse,
     ChatPromptResponse,
@@ -74,9 +77,23 @@ def verify_admin_key(authorization: str = Header(None)) -> None:
         raise HTTPException(status_code=401, detail="Unauthorized")
 
 
+def verify_anthropic_key(x_api_key: str = Header(None, alias="x-api-key")) -> None:
+    """Raise a 401 when shared.args.api_key is set and the x-api-key header is missing or different."""
+    if shared.args.api_key and x_api_key != shared.args.api_key:
+        raise HTTPException(status_code=401, detail="Unauthorized")
+
+
+class AnthropicError(Exception):
+    """Request failure carrying the Anthropic error type, message and HTTP status to report."""
+    def __init__(self, message: str, error_type: str = "invalid_request_error", status_code: int = 400):
+        super().__init__(message)  # populate Exception.args so str(exc) and tracebacks carry the message
+        self.message, self.error_type, self.status_code = message, error_type, status_code
+
+
 app = FastAPI()
 check_key = [Depends(verify_api_key)]
 check_admin_key = [Depends(verify_admin_key)]
+check_anthropic_key = [Depends(verify_anthropic_key)]
 
 # Configure CORS settings to allow all origins, methods, and headers
 app.add_middleware(
@@ -102,6 +119,28 @@ async def openai_error_handler(request: Request, exc: OpenAIError):
     )
 
 
+@app.exception_handler(AnthropicError)
+async def anthropic_error_handler(request: Request, exc: AnthropicError):
+    return JSONResponse(
+        status_code=exc.status_code,
+        content={"type": "error", "error": {"type": exc.error_type, "message": exc.message}}
+    )
+
+
+@app.exception_handler(RequestValidationError)
+async def validation_error_handler(request: Request, exc: RequestValidationError):
+    if request.url.path.startswith("/v1/messages"):
+        messages = "; ".join(
+            f"{'.'.join(str(l) for l in e['loc'])}: {e['msg']}" for e in exc.errors()
+        )
+        return JSONResponse(
+            status_code=400,
+            content={"type": "error", "error": {"type": "invalid_request_error", "message": messages}}
+        )
+
+    return JSONResponse(status_code=422, content={"detail": exc.errors()})
+
+
 @app.middleware("http")
 async def validate_host_header(request: Request, call_next):
     # Be strict about only approving access to localhost by default
@@ -211,6 +250,76 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
         return JSONResponse(response)
 
 
+@app.post('/v1/messages', dependencies=check_anthropic_key)
+async def anthropic_messages(request: Request, request_data: AnthropicRequest):
+    body = to_dict(request_data)
+    model = body.get('model') or shared.model_name or 'unknown'
+
+    try:
+        converted = Anthropic.convert_request(body)
+    except Exception as e:
+        raise AnthropicError(message=str(e))
+
+    try:
+        return await _anthropic_generate(request, request_data, converted, model)
+    except OpenAIError as e:
+        error_type = "invalid_request_error" if e.code < 500 else "api_error"
+        if e.code == 503:
+            error_type = "overloaded_error"
+        raise AnthropicError(message=e.message, error_type=error_type, status_code=e.code)
+    except Exception as e:
+        raise AnthropicError(message=str(e) or "Internal server error", error_type="api_error", status_code=500)
+
+
+async def _anthropic_generate(request, request_data, converted, model):
+    if request_data.stream:
+        stop_event = threading.Event()
+
+        async def generator():
+            converter = Anthropic.StreamConverter(model)
+            response = OAIcompletions.stream_chat_completions(converted, is_legacy=False, stop_event=stop_event)
+            try:
+                async for resp in iterate_in_threadpool(response):
+                    disconnected = await request.is_disconnected()
+                    if disconnected:
+                        break
+
+                    for event in converter.process_chunk(resp):
+                        yield event
+
+                for event in converter.finish():
+                    yield event
+            except OpenAIError as e:
+                error_type = "invalid_request_error" if e.code < 500 else "api_error"
+                if e.code == 503:
+                    error_type = "overloaded_error"
+                yield {
+                    "event": "error",
+                    "data": json.dumps({"type": "error", "error": {"type": error_type, "message": e.message}})
+                }
+            finally:
+                stop_event.set()
+                response.close()
+
+        return EventSourceResponse(generator(), sep="\n")
+
+    else:
+        stop_event = threading.Event()
+        monitor = asyncio.create_task(_wait_for_disconnect(request, stop_event))
+        try:
+            openai_resp = await asyncio.to_thread(
+                OAIcompletions.chat_completions,
+                converted,
+                is_legacy=False,
+                stop_event=stop_event
+            )
+        finally:
+            stop_event.set()
+            monitor.cancel()
+
+        return JSONResponse(Anthropic.build_response(openai_resp, model))
+
+
 @app.get("/v1/models", dependencies=check_key)
 @app.get("/v1/models/{model}", dependencies=check_key)
 async def handle_models(request: Request):
@@ -469,15 +578,15 @@ def run_server():
             port,
             shared.args.public_api_id,
             max_attempts=3,
-            on_start=lambda url: logger.info(f'OpenAI-compatible API URL:\n\n{url}/v1\n')
+            on_start=lambda url: logger.info(f'API URL (OpenAI + Anthropic compatible):\n\n{url}/v1\n')
         )
     else:
         url_proto = 'https://' if (ssl_certfile and ssl_keyfile) else 'http://'
         urls = [f'{url_proto}{addr}:{port}/v1' for addr in server_addrs]
         if len(urls) > 1:
-            logger.info('OpenAI-compatible API URLs:\n\n' + '\n'.join(urls) + '\n')
+            logger.info('API URLs (OpenAI + Anthropic compatible):\n\n' + '\n'.join(urls) + '\n')
         else:
-            logger.info('OpenAI-compatible API URL:\n\n' + '\n'.join(urls) + '\n')
+            logger.info('API URL (OpenAI + Anthropic compatible):\n\n' + '\n'.join(urls) + '\n')
 
     # Log API keys
     if shared.args.api_key:
diff --git a/modules/api/typing.py b/modules/api/typing.py
index 80831c44..1d486e8f 100644
--- a/modules/api/typing.py
+++ b/modules/api/typing.py
@@ -144,7 +144,7 @@ class CompletionResponse(BaseModel):
 
 
 class ChatCompletionRequestParams(BaseModel):
-    messages: List[dict]
+    messages: List[dict] = Field(..., min_length=1)
     model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
     frequency_penalty: float | None = shared.args.frequency_penalty
     function_call: str | dict | None = Field(default=None, description="Unused parameter.")
@@ -282,6 +282,25 @@ class LoadLorasRequest(BaseModel):
     lora_names: List[str]
 
 
+class AnthropicRequestParams(BaseModel):
+    model: str | None = None
+    messages: List[dict] = Field(..., min_length=1)
+    max_tokens: int
+    system: str | list | None = None
+    temperature: float | None = shared.args.temperature
+    top_p: float | None = shared.args.top_p
+    stop_sequences: list[str] | None = None
+    stream: bool = False
+    tools: list[dict] | None = None
+    tool_choice: dict | None = None
+    thinking: dict | None = None
+    metadata: dict | None = None
+
+
+class AnthropicRequest(GenerationOptions, AnthropicRequestParams):
+    pass
+
+
 class ImageGenerationRequest(BaseModel):
     """Image-specific parameters for generation."""
     prompt: str

From f2c909725ef667821a0e2ef5d68f4a2b86f0fd49 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 21 Mar 2026 11:09:06 -0700
Subject: [PATCH 154/210] API: Use top_p=0.95 by default

---
 modules/shared.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/shared.py b/modules/shared.py
index 69e16960..16ccbe77 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -175,7 +175,7 @@ group.add_argument('--dynatemp-high', type=float, default=_d['dynatemp_high'], m
 group.add_argument('--dynatemp-exponent', type=float, default=_d['dynatemp_exponent'], metavar='N', help='Dynamic temperature exponent')
 group.add_argument('--smoothing-factor', type=float, default=_d['smoothing_factor'], metavar='N', help='Smoothing factor')
 group.add_argument('--smoothing-curve', type=float, default=_d['smoothing_curve'], metavar='N', help='Smoothing curve')
-group.add_argument('--top-p', type=float, default=_d['top_p'], metavar='N', help='Top P')
+group.add_argument('--top-p', type=float, default=0.95, metavar='N', help='Top P')
 group.add_argument('--top-k', type=int, default=_d['top_k'], metavar='N', help='Top K')
 group.add_argument('--min-p', type=float, default=_d['min_p'], metavar='N', help='Min P')
 group.add_argument('--top-n-sigma', type=float, default=_d['top_n_sigma'], metavar='N', help='Top N Sigma')

From 2c4f36433986001f80fdbb0f9095aa68f43274d2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 21 Mar 2026 18:38:11 -0700
Subject: [PATCH 155/210] Update API docs to mention Anthropic support

---
 README.md               | 2 +-
 docs/12 - OpenAI API.md | 4 ++--
 modules/api/script.py   | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index cabb81fc..7e5566ec 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ A Gradio web UI for running Large Language Models locally. 100% private and offl
 - **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
 - **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
 - **Tool-calling**: Models can call custom functions during chat — web search, page fetching, math, and more. Each tool is a single `.py` file, easy to create and extend ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Tool-Calling-Tutorial)).
-- **OpenAI-compatible API**: Chat and Completions endpoints with tool-calling support. Use as a local drop-in replacement for the OpenAI API ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
+- **OpenAI/Anthropic-compatible API**: Chat, Completions, and Messages endpoints with tool-calling support. Use as a local drop-in replacement for the OpenAI/Anthropic APIs ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
 - **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
 - **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).
 - **Easy setup**: [Portable builds](https://github.com/oobabooga/text-generation-webui/releases) (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or a one-click installer for the full feature set.
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 276a7e19..2a7a7f69 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -1,6 +1,6 @@
-## OpenAI compatible API
+## OpenAI/Anthropic-compatible API
 
-The main API for this project is meant to be a drop-in replacement to the OpenAI API, including Chat and Completions endpoints.
+The main API for this project is meant to be a drop-in replacement for the OpenAI and Anthropic APIs, including Chat, Completions, and Messages endpoints.
 
 * It is 100% offline and private.
 * It doesn't create any logs.
diff --git a/modules/api/script.py b/modules/api/script.py
index a94247fa..5913c2c5 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -578,15 +578,15 @@ def run_server():
             port,
             shared.args.public_api_id,
             max_attempts=3,
-            on_start=lambda url: logger.info(f'API URL (OpenAI + Anthropic compatible):\n\n{url}/v1\n')
+            on_start=lambda url: logger.info(f'OpenAI/Anthropic-compatible API URL:\n\n{url}/v1\n')
         )
     else:
         url_proto = 'https://' if (ssl_certfile and ssl_keyfile) else 'http://'
         urls = [f'{url_proto}{addr}:{port}/v1' for addr in server_addrs]
         if len(urls) > 1:
-            logger.info('API URLs (OpenAI + Anthropic compatible):\n\n' + '\n'.join(urls) + '\n')
+            logger.info('OpenAI/Anthropic-compatible API URLs:\n\n' + '\n'.join(urls) + '\n')
         else:
-            logger.info('API URL (OpenAI + Anthropic compatible):\n\n' + '\n'.join(urls) + '\n')
+            logger.info('OpenAI/Anthropic-compatible API URL:\n\n' + '\n'.join(urls) + '\n')
 
     # Log API keys
     if shared.args.api_key:

From 9488df3e489c97cc26018d9ae1dc6a4bc0384f1b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:47:26 -0700
Subject: [PATCH 156/210] llama.cpp: Don't suppress llama-server logs

---
 modules/llama_cpp_server.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2ae01ddc..b77a8605 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -588,8 +588,11 @@ def filter_stderr_with_progress(process_stderr):
                             print(display_line, end=end_char, file=sys.stderr, flush=True)
                             last_was_progress = (progress < 1.0)
 
-                        # skip noise lines
-                        elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line or 'No parser definition detected' in line):
+                        # skip health check polling and parser warnings
+                        elif 'log_server_r: request: GET /health' in line or 'No parser definition detected' in line:
+                            continue
+
+                        else:
                             # if we were in progress, finish that line first
                             if last_was_progress:
                                 print(file=sys.stderr)

From 1dda5e47111eaf8cb90f25ffb94e47296def5c8f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:58:45 -0700
Subject: [PATCH 157/210] Follow-up to previous commit

---
 modules/llama_cpp_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index b77a8605..5cbf2122 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -588,8 +588,8 @@ def filter_stderr_with_progress(process_stderr):
                             print(display_line, end=end_char, file=sys.stderr, flush=True)
                             last_was_progress = (progress < 1.0)
 
-                        # skip health check polling and parser warnings
-                        elif 'log_server_r: request: GET /health' in line or 'No parser definition detected' in line:
+                        # skip noise lines
+                        elif 'log_server_r: request: GET /health' in line or 'No parser definition detected' in line or (last_was_progress and ('memory_seq_rm' in line or 'context checkpoint' in line)):
                             continue
 
                         else:

From bde496ea5daf9f7fa9a0ac90f8f8f25166738112 Mon Sep 17 00:00:00 2001
From: Phrosty1 <istas.phrost@gmail.com>
Date: Sun, 22 Mar 2026 20:48:56 -0400
Subject: [PATCH 158/210] Fix prompt corruption when continuing with context
 truncation (#7439)

---
 modules/chat.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index 148d559a..f8088e0f 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -434,6 +434,8 @@ def generate_chat_prompt(user_input, state, **kwargs):
         messages.append({"role": "user", "content": "fake user message replace me"})
 
     def make_prompt(messages):
+        if _continue:
+            messages = copy.deepcopy(messages)
         last_message = messages[-1].copy()
         if _continue:
             if state['mode'] == 'chat-instruct':

From 9ec20d9730db3f41270da12f51f7ce138fb8705c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 22 Mar 2026 19:16:24 -0700
Subject: [PATCH 159/210] Strip thinking blocks before tool-call parsing

---
 modules/tool_parsing.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index 7a7ed5d8..ec49f77f 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -2,6 +2,8 @@ import json
 import random
 import re
 
+from modules.reasoning import extract_reasoning
+
 
 def _make_tool_call(name, arguments):
     return {"type": "function", "function": {"name": name, "arguments": arguments}}
@@ -41,6 +43,10 @@ def streaming_tool_buffer_check(text, markers=None, tool_names=None, check_bare_
         check_bare_names: Whether to do partial-prefix matching on tool
                           names (for models with unknown template format).
     '''
+    # Strip thinking blocks so tool-call syntax inside <think> doesn't
+    # trigger false positives.
+    _, text = extract_reasoning(text)
+
     # Full marker found in text → buffer permanently.
     # Always checks ALL known markers regardless of template (cheap safety net).
     for marker in TOOL_CALL_OPENING_MARKERS:
@@ -543,12 +549,19 @@ def detect_tool_call_format(template_str):
 
 
 def parse_tool_call(answer: str, tool_names: list[str], return_prefix: bool = False, parsers: list = None):
+    # Strip thinking blocks so tool-call syntax inside <think> is ignored.
+    original_answer = answer
+    _, answer = extract_reasoning(answer)
+    # Offset between original and stripped text, used to map start_pos
+    # back to the original string when returning a prefix.
+    reasoning_offset = len(original_answer) - len(answer)
+
     matches = []
     start_pos = None
 
     def _return(matches, start_pos):
         if return_prefix:
-            prefix = answer[:start_pos] if matches and start_pos is not None else ''
+            prefix = original_answer[:start_pos + reasoning_offset] if matches and start_pos is not None else ''
             return matches, prefix
         return matches
 

From 307d0c92be2a4f8ac97f2be6c2cc3af1b9c8ad6f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 23 Mar 2026 06:35:14 -0700
Subject: [PATCH 160/210] UI polish

---
 css/main.css  | 8 ++++----
 modules/ui.py | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/css/main.css b/css/main.css
index 22fac5c5..a8c30a3f 100644
--- a/css/main.css
+++ b/css/main.css
@@ -54,7 +54,7 @@ div.svelte-iyf88w {
     height: 39.594px;
     align-self: end;
     line-height: 1em;
-    border-radius: 0.375rem;
+    border-radius: 0.75rem;
     flex: none;
 }
 
@@ -1420,7 +1420,7 @@ audio {
 }
 
 .dark .thinking-block {
-    background-color: var(--darker-gray);
+    background-color: var(--selected-item-color-dark);
     border: 1px solid var(--input-border-color);
 }
 
@@ -1558,7 +1558,7 @@ strong {
     min-height: 200px;
     max-height: 65vh;
     padding: 10px;
-    border-radius: 5px;
+    border-radius: 0.5rem;
     border: 1px solid #ccc;
     background-color: var(--light-theme-gray);
     font-family: inherit;
@@ -1586,7 +1586,7 @@ strong {
 .edit-control-button {
     padding: 6px 12px;
     border: 1px solid #ccc;
-    border-radius: 4px;
+    border-radius: 0.75rem;
     cursor: pointer;
     background-color: #f8f9fa;
     color: #212529;
diff --git a/modules/ui.py b/modules/ui.py
index 20bc8373..02b5a9fb 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -115,6 +115,7 @@ if not shared.args.old_colors:
         input_shadow_focus='none',
         input_shadow_focus_dark='none',
         button_large_radius='0.75rem',
+        button_small_radius='0.75rem',
         button_large_padding='6px 12px',
         input_radius='0.5rem',
         block_radius='0.375rem',

From 02f18a1d65881cb3ed291050a191d8cf712b7115 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 23 Mar 2026 07:06:38 -0700
Subject: [PATCH 161/210] API: Add thinking block signature field, fix error
 codes, clean up logging

---
 modules/api/anthropic.py   | 2 +-
 modules/api/embeddings.py  | 4 ++--
 modules/api/moderations.py | 2 --
 modules/api/script.py      | 9 +++++++--
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/modules/api/anthropic.py b/modules/api/anthropic.py
index 5fbf5caf..3fab09a6 100644
--- a/modules/api/anthropic.py
+++ b/modules/api/anthropic.py
@@ -326,7 +326,7 @@ class StreamConverter:
                     "data": json.dumps({
                         "type": "content_block_start",
                         "index": self.block_index,
-                        "content_block": {"type": "thinking", "thinking": ""}
+                        "content_block": {"type": "thinking", "thinking": "", "signature": ""}
                     })
                 })
             events.append({
diff --git a/modules/api/embeddings.py b/modules/api/embeddings.py
index ad299c9d..16cf0482 100644
--- a/modules/api/embeddings.py
+++ b/modules/api/embeddings.py
@@ -39,14 +39,14 @@ def load_embedding_model(model: str):
     initialize_embedding_params()
     global embeddings_device, embeddings_model
     try:
-        print(f"Try embedding model: {model} on {embeddings_device}")
+        logger.info(f"Try embedding model: {model} on {embeddings_device}")
         if 'jina-embeddings' in model:
             embeddings_model = AutoModel.from_pretrained(model, trust_remote_code=True)  # trust_remote_code is needed to use the encode method
             embeddings_model = embeddings_model.to(embeddings_device)
         else:
             embeddings_model = SentenceTransformer(model, device=embeddings_device)
 
-        print(f"Loaded embedding model: {model}")
+        logger.info(f"Loaded embedding model: {model}")
     except Exception as e:
         embeddings_model = None
         raise ServiceUnavailableError(f"Error: Failed to load embedding model: {model}", internal_message=repr(e))
diff --git a/modules/api/moderations.py b/modules/api/moderations.py
index ac0539d6..a41763cf 100644
--- a/modules/api/moderations.py
+++ b/modules/api/moderations.py
@@ -64,6 +64,4 @@ def moderations(input):
                 'category_scores': category_scores,
             }])
 
-    print(results)
-
     return results
diff --git a/modules/api/script.py b/modules/api/script.py
index 5913c2c5..85f4974f 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -506,12 +506,17 @@ async def handle_load_model(request_data: LoadModelRequest):
         return JSONResponse(content="OK")
     except Exception:
         traceback.print_exc()
-        raise HTTPException(status_code=400, detail="Failed to load the model.")
+        raise HTTPException(status_code=500, detail="Failed to load the model.")
 
 
 @app.post("/v1/internal/model/unload", dependencies=check_admin_key)
 async def handle_unload_model():
-    unload_model()
+    try:
+        unload_model()
+        return JSONResponse(content="OK")
+    except Exception:
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail="Failed to unload the model.")
 
 
 @app.get("/v1/internal/lora/list", response_model=LoraListResponse, dependencies=check_admin_key)

From 286bbb685d7bc585b8d82fd0e8d23515aeff9cb0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 23 Mar 2026 20:22:46 -0700
Subject: [PATCH 162/210] Revert "Follow-up to previous commit"

This reverts commit 1dda5e47111eaf8cb90f25ffb94e47296def5c8f.
---
 modules/llama_cpp_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 5cbf2122..b77a8605 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -588,8 +588,8 @@ def filter_stderr_with_progress(process_stderr):
                             print(display_line, end=end_char, file=sys.stderr, flush=True)
                             last_was_progress = (progress < 1.0)
 
-                        # skip noise lines
-                        elif 'log_server_r: request: GET /health' in line or 'No parser definition detected' in line or (last_was_progress and ('memory_seq_rm' in line or 'context checkpoint' in line)):
+                        # skip health check polling and parser warnings
+                        elif 'log_server_r: request: GET /health' in line or 'No parser definition detected' in line:
                             continue
 
                         else:

From a7ef430b38c2f6e7c9a043b2f94ec6c2108d1480 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 23 Mar 2026 20:22:51 -0700
Subject: [PATCH 163/210] Revert "llama.cpp: Don't suppress llama-server logs"

This reverts commit 9488df3e489c97cc26018d9ae1dc6a4bc0384f1b.
---
 modules/llama_cpp_server.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index b77a8605..2ae01ddc 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -588,11 +588,8 @@ def filter_stderr_with_progress(process_stderr):
                             print(display_line, end=end_char, file=sys.stderr, flush=True)
                             last_was_progress = (progress < 1.0)
 
-                        # skip health check polling and parser warnings
-                        elif 'log_server_r: request: GET /health' in line or 'No parser definition detected' in line:
-                            continue
-
-                        else:
+                        # skip noise lines
+                        elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line or 'No parser definition detected' in line):
                             # if we were in progress, finish that line first
                             if last_was_progress:
                                 print(file=sys.stderr)

From c9d2240f5045baed0f234f3937614bdbe63af340 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 06:45:39 -0700
Subject: [PATCH 164/210] Update README

---
 README.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 7e5566ec..ab6cc2e5 100644
--- a/README.md
+++ b/README.md
@@ -23,21 +23,20 @@ A Gradio web UI for running Large Language Models locally. 100% private and offl
 
 ## Features
 
+- **Easy setup**: [Portable builds](https://github.com/oobabooga/text-generation-webui/releases) (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or a one-click installer for the full feature set.
 - **Multiple backends**: [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Switch between backends and models without restarting.
-- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
-- **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
-- **Tool-calling**: Models can call custom functions during chat — web search, page fetching, math, and more. Each tool is a single `.py` file, easy to create and extend ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Tool-Calling-Tutorial)).
 - **OpenAI/Anthropic-compatible API**: Chat, Completions, and Messages endpoints with tool-calling support. Use as a local drop-in replacement for the OpenAI/Anthropic APIs ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
+- **Tool-calling**: Models can call custom functions during chat — web search, page fetching, math, and more. Each tool is a single `.py` file, easy to create and extend ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Tool-Calling-Tutorial)).
+- **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
+- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
 - **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
 - **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).
-- **Easy setup**: [Portable builds](https://github.com/oobabooga/text-generation-webui/releases) (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or a one-click installer for the full feature set.
 - 100% offline and private, with zero telemetry, external resources, or remote update requests.
 - `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters. Prompts are automatically formatted with Jinja2 templates.
 - Edit messages, navigate between message versions, and branch conversations at any point.
 - Free-form text generation in the Notebook tab without being limited to chat turns.
 - Multiple sampling parameters and generation options for sophisticated text generation control.
-- Aesthetic UI with dark and light themes.
-- Syntax highlighting for code blocks and LaTeX rendering for mathematical expressions.
+- Dark/light themes, syntax highlighting for code blocks, and LaTeX rendering for mathematical expressions.
 - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 
 ## How to install
@@ -429,7 +428,7 @@ API generation defaults:
 
 That's it. The UI will detect it automatically.
 
-To check what will fit your GPU, you can use the [VRAM Calculator](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator).
+To estimate how much memory a model will use, you can use the [GGUF Memory Calculator](https://huggingface.co/spaces/oobabooga/accurate-gguf-vram-calculator).
 
 <details>
 <summary>Other model types (Transformers, EXL3)</summary>

From 5b8da154b7aa4475718b819abba8acc1354e34eb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 09:34:59 -0700
Subject: [PATCH 165/210] Update llama.cpp

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 12 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index ad68ad59..56619627 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index b11e50b7..620683cc 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index d147af3f..b1f109b2 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d284c5d5..a54476a9 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 3952054e..be82c904 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index abf7690c..188da380 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 0d66c16c..4562b6d0 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 0658239a..04dcf25e 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index b66e2b38..4b8af78a 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index bb815bb2..5b0eaf89 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index d57ba40b..90b3234f 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 6abd8920..ea72b4ec 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.94.0/llama_cpp_binaries-0.94.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 5814e745be03d1f6f4cc6614e7a10d45282024b8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 11:14:22 -0700
Subject: [PATCH 166/210] UI: Minor polish

---
 css/main.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/css/main.css b/css/main.css
index a8c30a3f..d42bd6ae 100644
--- a/css/main.css
+++ b/css/main.css
@@ -582,7 +582,7 @@ audio {
 
 #chat-input textarea {
     background: #f3f4f6;
-    padding: 0.65rem 2.5rem 0.6rem;
+    padding: 0.675rem 2.5rem 0.6rem;
     margin-top: 0.15rem;
     border: 1px solid #d2d2d8;
     border-radius: 1.5rem;

From 750502695c4339dc525d50cf428960d7ffbeeb05 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 11:39:24 -0700
Subject: [PATCH 167/210] Fix GPT-OSS tool-calling after 9ec20d97

---
 modules/reasoning.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/modules/reasoning.py b/modules/reasoning.py
index 9c92719b..aa1939b8 100644
--- a/modules/reasoning.py
+++ b/modules/reasoning.py
@@ -72,10 +72,9 @@ def extract_reasoning(text, html_escaped=False):
                 if content_pos != -1:
                     content_start = content_pos + len(content_esc)
                 else:
-                    # Content tag expected but not yet present (e.g. partial
-                    # streaming) — suppress intermediate tags between end_tag
-                    # and content_tag so they don't leak as content.
-                    content_start = len(text)
+                    # Content tag not present — fall back to content after
+                    # end_tag (e.g. GPT-OSS tool calls skip the final channel).
+                    content_start = end_pos + len(end_esc)
             else:
                 content_start = end_pos + len(end_esc)
 

From f48a2b79d022a3f503085d6daeb3706b3b6dc2e0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 11:45:33 -0700
Subject: [PATCH 168/210] UI: Minor polish

---
 css/main.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/css/main.css b/css/main.css
index d42bd6ae..009b7c0a 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1420,7 +1420,7 @@ audio {
 }
 
 .dark .thinking-block {
-    background-color: var(--selected-item-color-dark);
+    background-color: transparent;
     border: 1px solid var(--input-border-color);
 }
 

From 807be1183272fac409ce8f08609dbdd0d9f63362 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 18:48:50 -0700
Subject: [PATCH 169/210] Remove obsolete models/config.yaml and related code

---
 docs/01 - Chat Tab.md        |   2 +-
 docs/12 - OpenAI API.md      |   2 +-
 modules/models.py            |   1 -
 modules/models_settings.py   |   9 +-
 modules/shared.py            |  10 --
 server.py                    |   5 -
 user_data/models/config.yaml | 203 -----------------------------------
 7 files changed, 4 insertions(+), 228 deletions(-)
 delete mode 100644 user_data/models/config.yaml

diff --git a/docs/01 - Chat Tab.md b/docs/01 - Chat Tab.md
index 5104895f..96b232fa 100644
--- a/docs/01 - Chat Tab.md	
+++ b/docs/01 - Chat Tab.md	
@@ -112,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin
 
 The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template.
 
-Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format.
+Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any) from the model metadata (e.g. `tokenizer_config.json` or GGUF metadata), and will update the values under "Parameters" > "Instruction template" accordingly. You should check the model card on Hugging Face to see if you are using the correct prompt format.
 
 ### Chat-instruct
 
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 2a7a7f69..0a076c35 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \
 
 #### Chat completions
 
-Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`.
+Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be detected automatically from the model metadata.
 
 ```shell
 curl http://127.0.0.1:5000/v1/chat/completions \
diff --git a/modules/models.py b/modules/models.py
index 1d139b89..b2665c6b 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -67,7 +67,6 @@ def load_model(model_name, loader=None):
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
     logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
-    logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
     return model, tokenizer
 
 
diff --git a/modules/models_settings.py b/modules/models_settings.py
index dcface71..eafa0581 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -23,14 +23,9 @@ def get_fallback_settings():
 
 def get_model_metadata(model):
     model_path = resolve_model_path(model)
-    model_settings = {}
 
-    # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
-    settings = shared.model_config
-    for pat in settings:
-        if re.match(pat.lower(), Path(model).name.lower()):
-            for k in settings[pat]:
-                model_settings[k] = settings[pat][k]
+    # Fallback settings
+    model_settings = get_fallback_settings()
 
     path = model_path / 'config.json'
     if path.exists():
diff --git a/modules/shared.py b/modules/shared.py
index 16ccbe77..acb103b4 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -454,17 +454,7 @@ def load_user_config():
 
 args.loader = fix_loader_name(args.loader)
 
-# Load model-specific settings
-p = Path(f'{args.model_dir}/config.yaml')
-if p.exists():
-    model_config = yaml.safe_load(open(p, 'r').read())
-else:
-    model_config = {}
-del p
-
-
 # Load custom model-specific settings
 user_config = load_user_config()
 
-model_config = OrderedDict(model_config)
 user_config = OrderedDict(user_config)
diff --git a/server.py b/server.py
index d224909c..88936ca6 100644
--- a/server.py
+++ b/server.py
@@ -18,7 +18,6 @@ import modules.extensions as extensions_module
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
-    get_fallback_settings,
     get_model_metadata,
     update_model_parameters
 )
@@ -271,10 +270,6 @@ if __name__ == "__main__":
     # Apply CLI overrides for image model settings (CLI flags take precedence over saved settings)
     shared.apply_image_model_cli_overrides()
 
-    # Fallback settings for models
-    shared.model_config['.*'] = get_fallback_settings()
-    shared.model_config.move_to_end('.*', last=False)  # Move to the beginning
-
     # Activate the extensions listed on settings.yaml
     extensions_module.available_extensions = utils.get_available_extensions()
     for extension in shared.settings['default_extensions']:
diff --git a/user_data/models/config.yaml b/user_data/models/config.yaml
deleted file mode 100644
index 038ebcf1..00000000
--- a/user_data/models/config.yaml
+++ /dev/null
@@ -1,203 +0,0 @@
-.*(llama|alpac|vicuna|guanaco|koala|llava|wizardlm|metharme|pygmalion-7b|pygmalion-2|mythalion|wizard-mega|openbuddy|vigogne|h2ogpt-research|manticore):
-  model_type: 'llama'
-.*(opt-|opt_|opt1|opt3|optfor|galactica|galpaca|pygmalion-350m):
-  model_type: 'opt'
-.*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1):
-  model_type: 'gptj'
-.*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm):
-  model_type: 'gptneox'
-.*bloom:
-  model_type: 'bloom'
-.*gpt2:
-  model_type: 'gpt2'
-.*falcon:
-  model_type: 'falcon'
-.*mpt:
-  model_type: 'mpt'
-.*(starcoder|starchat):
-  model_type: 'starcoder'
-.*dolly-v2:
-  model_type: 'dollyv2'
-.*replit:
-  model_type: 'replit'
-.*(oasst|openassistant-|stablelm-7b-sft-v7-epoch-3):
-  instruction_template: 'Open Assistant'
-  skip_special_tokens: false
-(?!.*galactica)(?!.*reward).*openassistant:
-  instruction_template: 'Open Assistant'
-  skip_special_tokens: false
-.*galactica:
-  skip_special_tokens: false
-.*dolly-v[0-9]-[0-9]*b:
-  instruction_template: 'Alpaca'
-  skip_special_tokens: false
-.*alpaca-native-4bit:
-  instruction_template: 'Alpaca'
-.*llava:
-  instruction_template: 'LLaVA'
-.*llava.*1.5:
-  instruction_template: 'Vicuna-v1.1'
-.*wizard.*mega:
-  instruction_template: 'Wizard-Mega'
-.*starchat-beta:
-  instruction_template: 'Starchat-Beta'
-(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna:
-  instruction_template: 'Vicuna-v0'
-.*vicuna.*v0:
-  instruction_template: 'Vicuna-v0'
-.*vicuna.*(1.1|1_1|1.3|1_3):
-  instruction_template: 'Vicuna-v1.1'
-.*vicuna.*(1.5|1_5):
-  instruction_template: 'Vicuna-v1.1'
-.*stable.*vicuna:
-  instruction_template: 'StableVicuna'
-(?!.*chat).*chinese-vicuna:
-  instruction_template: 'Alpaca'
-.*chinese-vicuna.*chat:
-  instruction_template: 'Chinese-Vicuna-Chat'
-.*alpaca:
-  instruction_template: 'Alpaca'
-.*koala:
-  instruction_template: 'Koala'
-.*chatglm:
-  instruction_template: 'ChatGLM'
-.*(metharme|pygmalion|mythalion):
-  instruction_template: 'Metharme'
-.*raven:
-  instruction_template: 'RWKV-Raven'
-.*moss-moon.*sft:
-  instruction_template: 'MOSS'
-.*stablelm-tuned:
-  instruction_template: 'StableLM'
-.*galactica.*finetuned:
-  instruction_template: 'Galactica Finetuned'
-.*galactica.*-v2:
-  instruction_template: 'Galactica v2'
-(?!.*finetuned)(?!.*-v2).*galactica:
-  instruction_template: 'Galactica'
-.*guanaco:
-  instruction_template: 'Guanaco non-chat'
-.*baize:
-  instruction_template: 'Baize'
-.*mpt-.*instruct:
-  instruction_template: 'Alpaca'
-.*mpt-.*chat:
-  instruction_template: 'ChatML'
-(?!.*-flan-)(?!.*-t5-).*lamini-:
-  instruction_template: 'Alpaca'
-.*incite.*chat:
-  instruction_template: 'INCITE-Chat'
-.*incite.*instruct:
-  instruction_template: 'INCITE-Instruct'
-.*ziya-:
-  instruction_template: 'Ziya'
-.*koalpaca:
-  instruction_template: 'KoAlpaca'
-.*openbuddy:
-  instruction_template: 'OpenBuddy'
-(?!.*chat).*vigogne:
-  instruction_template: 'Vigogne-Instruct'
-.*vigogne.*chat:
-  instruction_template: 'Vigogne-Chat'
-.*(llama-deus|supercot|llama-natural-instructions|open-llama-0.3t-7b-instruct-dolly-hhrlhf|open-llama-0.3t-7b-open-instruct):
-  instruction_template: 'Alpaca'
-.*bactrian:
-  instruction_template: 'Bactrian'
-.*(h2ogpt-oig-|h2ogpt-oasst1-|h2ogpt-research-oasst1-):
-  instruction_template: 'INCITE-Chat'
-.*h2ogpt-gm-:
-  instruction_template: 'H2O-prompt_answer'
-.*manticore:
-  instruction_template: 'Manticore Chat'
-.*bluemoonrp-(30|13)b:
-  instruction_template: 'Bluemoon'
-.*Nous-Hermes-13b:
-  instruction_template: 'Alpaca'
-.*airoboros:
-  instruction_template: 'Vicuna-v1.1'
-.*airoboros.*1.2:
-  instruction_template: 'Airoboros-v1.2'
-.*alpa(cino|sta):
-  instruction_template: 'Alpaca'
-.*hippogriff:
-  instruction_template: 'Hippogriff'
-.*lazarus:
-  instruction_template: 'Alpaca'
-.*guanaco-.*(7|13|33|65)b:
-  instruction_template: 'Vicuna-v0'
-.*hypermantis:
-  instruction_template: 'Alpaca'
-.*open-llama-.*-open-instruct:
-  instruction_template: 'Alpaca'
-.*starcoder-gpteacher-code-instruct:
-  instruction_template: 'Alpaca'
-.*tulu:
-  instruction_template: 'Tulu'
-.*chronos:
-  instruction_template: 'Alpaca'
-.*samantha:
-  instruction_template: 'Samantha'
-.*wizardcoder:
-  instruction_template: 'Alpaca'
-.*minotaur:
-  instruction_template: 'Manticore Chat'
-.*orca_mini:
-  instruction_template: 'Orca Mini'
-.*(platypus|gplatty|superplatty):
-  instruction_template: 'Alpaca'
-.*(openorca-platypus2):
-  instruction_template: 'OpenOrca-Platypus2'
-.*longchat:
-  instruction_template: 'Vicuna-v1.1'
-.*vicuna-33b:
-  instruction_template: 'Vicuna-v1.1'
-.*redmond-hermes-coder:
-  instruction_template: 'Alpaca'
-.*wizardcoder-15b:
-  instruction_template: 'Alpaca'
-.*wizardlm:
-  instruction_template: 'Vicuna-v1.1'
-.*godzilla:
-  instruction_template: 'Alpaca'
-.*llama(-?)(2|v2).*chat:
-  instruction_template: 'Llama-v2'
-.*newhope:
-  instruction_template: 'NewHope'
-.*stablebeluga2:
-  instruction_template: 'StableBeluga2'
-.*openchat:
-  instruction_template: 'OpenChat'
-.*codellama.*instruct:
-  instruction_template: 'Llama-v2'
-.*(mistral|mixtral).*instruct:
-  instruction_template: 'Mistral'
-.*mistral.*openorca:
-  instruction_template: 'ChatML'
-.*(WizardCoder-Python-34B-V1.0|Phind-CodeLlama-34B-v2|CodeBooga-34B-v0.1):
-  instruction_template: 'Alpaca'
-.*orca-2-(13|7)b:
-  instruction_template: 'ChatML'
-.*openhermes.*mistral:
-  instruction_template: 'ChatML'
-.*Yi-34B-Chat:
-  instruction_template: 'ChatML'
-(dolphin).*:
-  instruction_template: 'ChatML'
-.*synthia:
-  instruction_template: 'Synthia'
-.*(hercules|hyperion):
-  instruction_template: 'ChatML'
-.*command-r:
-  instruction_template: 'Command-R'
-.*xwin-lm-70b-v0.1:
-  instruction_template: 'Vicuna-v1.1'
-.*platypus-yi-34b:
-  instruction_template: 'Vicuna-v1.1'
-.*CausalLM-RP-34B:
-  instruction_template: 'ChatML'
-34b-beta:
-  instruction_template: 'ChatML'
-.*airoboros-3_1-yi-34b-200k:
-  instruction_template: 'Llama-v2'
-.*chatqa:
-  instruction_template: 'NVIDIA-ChatQA'

From d6f1485dd189494f6fbe5b6ea7ebd5cc0404233a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 21:45:11 -0700
Subject: [PATCH 170/210] UI: Update the enable_thinking info message

---
 modules/ui_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index f1dc7883..10d05f65 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -82,7 +82,7 @@ def create_ui():
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
-                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')
+                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.')
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 

From 368f37335f634ba001d00d2841902de85c7b48db Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 25 Mar 2026 06:37:45 -0700
Subject: [PATCH 171/210] Fix --idle-timeout issues with encode/decode and
 parallel generation

---
 modules/logits.py          |  4 +---
 modules/models.py          | 15 ++++++++++++++-
 modules/text_generation.py | 18 +++++++++++++-----
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/modules/logits.py b/modules/logits.py
index 1f878f27..473f5890 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -4,7 +4,6 @@ import numpy as np
 
 from modules import models, shared
 from modules.logging_colors import logger
-from modules.models import load_model
 from modules.text_generation import generate_reply
 from modules.utils import check_model_loaded
 
@@ -12,8 +11,7 @@ global_scores = None
 
 
 def get_next_logits(*args, **kwargs):
-    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
-        shared.model, shared.tokenizer = load_model(shared.model_name)
+    models.load_model_if_idle_unloaded()
 
     needs_lock = not args[2]  # use_samplers
     if needs_lock:
diff --git a/modules/models.py b/modules/models.py
index b2665c6b..61ca3838 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -1,4 +1,5 @@
 import sys
+import threading
 import time
 
 import modules.shared as shared
@@ -7,6 +8,15 @@ from modules.models_settings import get_model_metadata
 from modules.utils import resolve_model_path
 
 last_generation_time = time.time()
+active_generation_count = 0
+_generation_count_lock = threading.Lock()
+
+
+def load_model_if_idle_unloaded():
+    global last_generation_time
+    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
+        shared.model, shared.tokenizer = load_model(shared.model_name)
+        last_generation_time = time.time()
 
 
 def load_model(model_name, loader=None):
@@ -158,7 +168,10 @@ def unload_model_if_idle():
     while True:
         shared.generation_lock.acquire()
         try:
-            if time.time() - last_generation_time > shared.args.idle_timeout * 60:
+            with _generation_count_lock:
+                is_active = active_generation_count > 0
+
+            if not is_active and time.time() - last_generation_time > shared.args.idle_timeout * 60:
                 if shared.model is not None:
                     logger.info("Unloading the model for inactivity.")
                     unload_model(keep_model_name=True)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f77be124..3a9ddab5 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -17,9 +17,7 @@ from modules.utils import check_model_loaded
 
 
 def generate_reply(*args, **kwargs):
-    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
-        from modules.models import load_model
-        shared.model, shared.tokenizer = load_model(shared.model_name)
+    models.load_model_if_idle_unloaded()
 
     state = args[1] if len(args) > 1 else kwargs.get('state', {})
     use_parallel = (
@@ -31,10 +29,16 @@ def generate_reply(*args, **kwargs):
     if not use_parallel:
         shared.generation_lock.acquire()
 
+    with models._generation_count_lock:
+        models.active_generation_count += 1
+
     try:
         for result in _generate_reply(*args, **kwargs):
             yield result
     finally:
+        with models._generation_count_lock:
+            models.active_generation_count -= 1
+
         models.last_generation_time = time.time()
         if not use_parallel:
             shared.generation_lock.release()
@@ -126,7 +130,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
 
 def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
     if shared.tokenizer is None:
-        raise ValueError('No tokenizer is loaded')
+        models.load_model_if_idle_unloaded()
+        if shared.tokenizer is None:
+            raise ValueError('No tokenizer is loaded')
 
     # llama.cpp case
     if shared.model.__class__.__name__ == 'LlamaServer':
@@ -176,7 +182,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
 
 def decode(output_ids, skip_special_tokens=True):
     if shared.tokenizer is None:
-        raise ValueError('No tokenizer is loaded')
+        models.load_model_if_idle_unloaded()
+        if shared.tokenizer is None:
+            raise ValueError('No tokenizer is loaded')
 
     return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens)
 

From e1541400219043f9b9cebf5f002b48251efc8bf9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 25 Mar 2026 07:21:02 -0700
Subject: [PATCH 172/210] Rename "truncation length" to "context length" in
 logs

---
 modules/api/models.py | 2 +-
 modules/models.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/api/models.py b/modules/api/models.py
index c879a860..b89397d3 100644
--- a/modules/api/models.py
+++ b/modules/api/models.py
@@ -68,7 +68,7 @@ def _load_model(data):
             if k in shared.settings:
                 shared.settings[k] = settings[k]
                 if k == 'truncation_length':
-                    logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}")
+                    logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}")
                 elif k == 'instruction_template':
                     logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}")
 
diff --git a/modules/models.py b/modules/models.py
index 61ca3838..e997d2d8 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -76,7 +76,7 @@ def load_model(model_name, loader=None):
 
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
-    logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
+    logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}")
     return model, tokenizer
 
 

From 4cbea02ed4e0dee2efd066ac48bcdf33631b9eca Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 26 Mar 2026 06:49:39 -0700
Subject: [PATCH 173/210] Add ik_llama.cpp support via `--ik` flag

---
 modules/llama_cpp_server.py | 37 +++++++++++++++++++++++++++++++++++++
 modules/shared.py           |  1 +
 2 files changed, 38 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2ae01ddc..9b9756a9 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -470,6 +470,10 @@ class LlamaServer:
                         else:
                             cmd.append(f"--{flag_item}")
 
+        # Patch flags for ik_llama.cpp compatibility
+        if shared.args.ik:
+            cmd = _patch_cmd_for_ik(cmd)
+
         env = os.environ.copy()
         if os.name == 'posix':
             current_path = env.get('LD_LIBRARY_PATH', '')
@@ -607,3 +611,36 @@ def filter_stderr_with_progress(process_stderr):
             process_stderr.close()
         except Exception:
             pass
+
+
+def _patch_cmd_for_ik(cmd):
+    """
+    Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents:
+      --no-webui          → --webui none
+      --fit off            → (removed)
+      --fit on / --fit-ctx → --fit (bare flag)
+      --fit-target         → --fit-margin
+    """
+    patched = []
+    i = 0
+    while i < len(cmd):  # index-based walk: some branches consume two entries
+        arg = cmd[i]
+
+        if arg == "--no-webui":
+            patched += ["--webui", "none"]
+        elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"):
+            val = cmd[i + 1]
+            i += 1  # also consume the on/off value
+            if val == "on":
+                patched.append("--fit")
+            # "off" → drop entirely
+        elif arg == "--fit-ctx":
+            i += 1  # skip the value
+        elif arg == "--fit-target":
+            patched.append("--fit-margin")
+        else:
+            patched.append(arg)
+
+        i += 1  # step past the current entry
+
+    return patched
diff --git a/modules/shared.py b/modules/shared.py
index acb103b4..c50736d7 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,6 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside <venv>/lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')

From bda95172bd6abecba165fc118f140cfc446f3c42 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 06:09:53 -0700
Subject: [PATCH 174/210] Fix stopping string detection for chromadb/context-1

---
 modules/chat.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index f8088e0f..edda11b0 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -671,7 +671,10 @@ def get_stopping_strings(state):
     # Handle GPT-OSS as a special case
     if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
         result.remove("<|end|>")
-        result.append("<|result|>")
+        if '<|result|>' in state['instruction_template_str']:
+            result.append("<|result|>")
+        elif '<|return|>' in state['instruction_template_str']:
+            result.append("<|return|>")
         result = list(set(result))
 
     if shared.args.verbose:

From 9dd04b86ce407507bcaf0862b97aadc64b6e62a6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 06:17:57 -0700
Subject: [PATCH 175/210] Suppress EOS token at logit level for ExLlamav3 when
 ban_eos_token is set

---
 modules/exllamav3.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 75c76c7c..f873503a 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -423,6 +423,15 @@ class Exllamav3Model:
         if logit_bias:
             filters.append(LogitBiasFilter(self.tokenizer, logit_bias))
 
+        # Suppress EOS tokens via logit bias so they are never sampled
+        if state['ban_eos_token']:
+            eos_bias = {}
+            for eos_id in self.config.eos_token_id_list:
+                if eos_id is not None:
+                    eos_bias[str(eos_id)] = float('-inf')
+            if eos_bias:
+                filters.append(LogitBiasFilter(self.tokenizer, eos_bias))
+
         # Logprobs support (OpenAI API)
         logprobs = state.get('logprobs', 0) or 0
         return_top_tokens = logprobs if logprobs > 0 else 0

From 4979e87e48c78d5e3186e4d9b2fbc8b30e86164f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 11:49:47 -0300
Subject: [PATCH 176/210] Add ik_llama.cpp support via ik_llama_cpp_binaries
 package

---
 .github/workflows/build-everything-tgw.yml    |  35 +++
 .../build-portable-release-ik-cuda.yml        | 179 +++++++++++++++
 .../workflows/build-portable-release-ik.yml   | 205 ++++++++++++++++++
 modules/llama_cpp_server.py                   |  21 +-
 modules/loaders.py                            |   2 +
 modules/shared.py                             |   2 +-
 modules/ui_model_menu.py                      |   3 +
 requirements/full/requirements.txt            |   6 +-
 requirements/full/requirements_amd.txt        |   4 +-
 .../full/requirements_apple_intel.txt         |   3 +-
 .../full/requirements_apple_silicon.txt       |   3 +-
 requirements/full/requirements_cpu_only.txt   |   6 +-
 requirements/portable/requirements.txt        |   4 +-
 requirements/portable/requirements_amd.txt    |   4 +-
 .../portable/requirements_apple_intel.txt     |   2 +-
 .../portable/requirements_apple_silicon.txt   |   2 +-
 .../portable/requirements_cpu_only.txt        |   4 +-
 .../portable/requirements_cuda131.txt         |   4 +-
 requirements/portable/requirements_vulkan.txt |   4 +-
 19 files changed, 469 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/build-portable-release-ik-cuda.yml
 create mode 100644 .github/workflows/build-portable-release-ik.yml

diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 9322f859..4de591f4 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -68,3 +68,38 @@ jobs:
     with:
       version: ${{ inputs.version }}
       config: 'os:macos-15-intel,macos-14'
+
+  build_release_ik_cuda_windows:
+    name: ik CUDA Windows
+    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:windows-2022'
+
+  build_release_ik_cuda_linux:
+    name: ik CUDA Linux
+    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:ubuntu-22.04'
+
+  build_release_ik_cpu_windows:
+    name: ik CPU Windows
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:windows-2022'
+
+  build_release_ik_cpu_linux:
+    name: ik CPU Linux
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:ubuntu-22.04'
+
+  build_release_ik_macos:
+    name: ik macOS
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:macos-14'
diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml
new file mode 100644
index 00000000..40b4b92f
--- /dev/null
+++ b/.github/workflows/build-portable-release-ik-cuda.yml
@@ -0,0 +1,179 @@
+name: Build ik CUDA
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+  workflow_call:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+
+permissions:
+  contents: write
+
+jobs:
+  define_matrix:
+    name: Define Build Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      CONFIGIN: ${{ inputs.config }}
+      EXCLUDEIN: ${{ inputs.exclude }}
+
+    steps:
+      - name: Define Job Output
+        id: set-matrix
+        run: |
+          $matrix = @{  # defaults, used as-is when the config input is 'Default'
+              'os' = @('ubuntu-22.04', 'windows-2022')
+              'pyver' = @("3.13")
+              'cuda' = @("12.4", "13.1")
+          }
+          # Each ';'-separated "key:v1,v2" group of the config input sets that key's value list.
+          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
+          # Each ';'-separated "k1:v1,k2:v2" group of the exclude input becomes one hashtable.
+          if ($env:EXCLUDEIN -ne 'None') {
+              $exclusions = @()
+              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
+              $matrix['exclude'] = $exclusions
+          }
+          # Expose the matrix as compact JSON through the step output named "matrix".
+          $matrixOut = ConvertTo-Json $matrix -Compress
+          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+  build_wheels:
+    name: ${{ matrix.os }} ${{ matrix.pyver }} CUDA ${{ matrix.cuda }}
+    needs: define_matrix
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+    defaults:
+      run:
+        shell: pwsh
+    # Inputs and matrix values reach the build script as environment variables
+    # instead of being expanded into the script text by the expression engine.
+    env:
+      PCKGVER: ${{ inputs.version }}
+      CUDA_VERSION: ${{ matrix.cuda }}
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          repository: 'oobabooga/text-generation-webui'
+          ref: ${{ inputs.version }}
+          submodules: 'recursive'
+
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      - name: Build Package
+        shell: bash
+        run: |
+            # Strip the leading "v" from the release tag (e.g. v3.0 -> 3.0)
+            VERSION_CLEAN="${PCKGVER#v}"
+            cd ..
+            cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # Remove extensions that need additional requirements
+            allowed=("character_bias" "gallery" "sd_api_pictures")
+            find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
+
+            # 1. Set platform-specific variables
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                PLATFORM="windows"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/python.exe -m pip"
+                PACKAGES_PATH="portable_env/Lib/site-packages"
+                rm start_linux.sh start_macos.sh
+            else
+                PLATFORM="linux"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_macos.sh start_windows.bat
+            fi
+
+            # 2. Download and extract Python
+            cd ..
+            echo "Downloading Python for $PLATFORM..."
+            curl -L -o python-build.tar.gz "$PYTHON_URL"
+            tar -xzf python-build.tar.gz
+            mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+
+            # 3. Prepare requirements file based on CUDA version
+            cd "text-generation-webui-${VERSION_CLEAN}"
+            if [[ "$CUDA_VERSION" == "13.1" ]]; then
+                REQ_FILE="requirements/portable/requirements_cuda131.txt"
+            else
+                REQ_FILE="requirements/portable/requirements.txt"
+            fi
+
+            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
+            # (sed errors are ignored on the second call: step 1 already deleted the other platforms' scripts)
+            sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true
+
+            # 5. Install packages
+            echo "Installing Python packages from $REQ_FILE..."
+            $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
+
+            # 6. Clean up
+            rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
+
+            # 7. Create archive
+            cd ..
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+            else
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+            fi
+
+      - name: Upload files to a GitHub release
+        id: upload-release
+        uses: svenstaro/upload-release-action@2.7.0
+        continue-on-error: true
+        with:
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+          file: ../textgen-portable-ik-*
+          tag: ${{ inputs.version }}
+          file_glob: true
+          make_latest: false
+          overwrite: true
diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml
new file mode 100644
index 00000000..afb2e763
--- /dev/null
+++ b/.github/workflows/build-portable-release-ik.yml
@@ -0,0 +1,205 @@
+name: Build ik CPU and macOS
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+  workflow_call:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+
+permissions:
+  contents: write
+
+jobs:
+  define_matrix:
+    name: Define Build Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      CONFIGIN: ${{ inputs.config }}
+      EXCLUDEIN: ${{ inputs.exclude }}
+
+    steps:
+      - name: Define Job Output
+        id: set-matrix
+        run: |
+          $matrix = @{  # defaults, used as-is when the config input is 'Default'
+              'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
+              'pyver' = @("3.13")
+          }
+          # Each ';'-separated "key:v1,v2" group of the config input sets that key's value list.
+          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
+          # Each ';'-separated "k1:v1,k2:v2" group of the exclude input becomes one hashtable.
+          if ($env:EXCLUDEIN -ne 'None') {
+              $exclusions = @()
+              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
+              $matrix['exclude'] = $exclusions
+          }
+          # Expose the matrix as compact JSON through the step output named "matrix".
+          $matrixOut = ConvertTo-Json $matrix -Compress
+          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+  build_wheels:
+    name: ${{ matrix.os }} ${{ matrix.pyver }}
+    needs: define_matrix
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+    defaults:
+      run:
+        shell: pwsh
+    # Inputs and matrix values reach the build script as environment variables
+    # instead of being expanded into the script text by the expression engine.
+    env:
+      PCKGVER: ${{ inputs.version }}
+      OS_TYPE: ${{ matrix.os }}
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          repository: 'oobabooga/text-generation-webui'
+          ref: ${{ inputs.version }}
+          submodules: 'recursive'
+
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      - name: Build Package
+        shell: bash
+        run: |
+            # Strip the leading "v" from the release tag (e.g. v3.0 -> 3.0)
+            VERSION_CLEAN="${PCKGVER#v}"
+            cd ..
+            cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # Remove extensions that need additional requirements
+            allowed=("character_bias" "gallery" "sd_api_pictures")
+            find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
+
+            # 1. Set platform-specific variables
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                PLATFORM="windows-cpu"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/python.exe -m pip"
+                PACKAGES_PATH="portable_env/Lib/site-packages"
+                rm start_linux.sh start_macos.sh
+            elif [[ "$RUNNER_OS" == "macOS" ]]; then
+                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
+                    PLATFORM="macos-x86_64"
+                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz"
+                    REQ_TYPE="apple_intel"
+                else
+                    PLATFORM="macos-arm64"
+                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz"
+                    REQ_TYPE="apple_silicon"
+                fi
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_linux.sh start_windows.bat
+            else
+                # Linux case
+                PLATFORM="linux-cpu"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_macos.sh start_windows.bat
+            fi
+
+            # 2. Download and extract Python
+            echo "Downloading Python for $PLATFORM..."
+            cd ..
+            curl -L -o python-build.tar.gz "$PYTHON_URL"
+            tar -xzf python-build.tar.gz
+            mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+
+            # 3. Prepare requirements file based on platform
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # Select requirements file based on platform
+            if [[ "$RUNNER_OS" == "macOS" ]]; then
+                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
+                    REQ_FILE="requirements/portable/requirements_apple_intel.txt"
+                else
+                    REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
+                fi
+            else
+                REQ_FILE="requirements/portable/requirements_cpu_only.txt"
+            fi
+
+            echo "Using requirements file: $REQ_FILE"
+
+            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
+            # (the macOS branch passes an explicit empty backup suffix after -i, as BSD sed requires)
+            if [[ "$RUNNER_OS" == "macOS" ]]; then
+                sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+                sed -i '' 's/--portable/--portable --ik/g' start_macos.sh
+            else
+                sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+                sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
+            fi
+
+            # 5. Install packages
+            echo "Installing Python packages from $REQ_FILE..."
+            $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
+
+            # 6. Clean up
+            rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
+
+            # 7. Create archive
+            cd ..
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+            else
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+            fi
+
+      - name: Upload files to a GitHub release
+        id: upload-release
+        uses: svenstaro/upload-release-action@2.7.0
+        continue-on-error: true
+        with:
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+          file: ../textgen-portable-ik-*
+          tag: ${{ inputs.version }}
+          file_glob: true
+          make_latest: false
+          overwrite: true
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 9b9756a9..5e2decfa 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -11,7 +11,6 @@ import time
 from pathlib import Path
 from typing import Any, List
 
-import llama_cpp_binaries
 import requests
 
 from modules import shared
@@ -357,7 +356,16 @@ class LlamaServer:
         """Start the llama.cpp server and wait until it's ready."""
         # Determine the server path
         if self.server_path is None:
-            self.server_path = llama_cpp_binaries.get_binary_path()
+            if shared.args.ik:
+                try:
+                    import ik_llama_cpp_binaries
+                except ImportError:
+                    raise ImportError("--ik requires the ik_llama_cpp_binaries package. Install it with: pip install <ik_llama_cpp_binaries wheel URL>")
+
+                self.server_path = ik_llama_cpp_binaries.get_binary_path()
+            else:
+                import llama_cpp_binaries
+                self.server_path = llama_cpp_binaries.get_binary_path()
 
         # Build the command
         cmd = [
@@ -616,10 +624,12 @@ def filter_stderr_with_progress(process_stderr):
 def _patch_cmd_for_ik(cmd):
     """
     Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents:
-      --no-webui          → --webui none
+      --no-webui           → --webui none
       --fit off            → (removed)
       --fit on / --fit-ctx → --fit (bare flag)
       --fit-target         → --fit-margin
+      --cache-reuse        → (removed, unsupported)
+      --swa-full           → (removed, unsupported)
     """
     patched = []
     i = 0
@@ -635,9 +645,14 @@ def _patch_cmd_for_ik(cmd):
                 patched.append("--fit")
             # "off" → drop entirely
         elif arg == "--fit-ctx":
+            patched.append("--fit")
             i += 1  # skip the value
         elif arg == "--fit-target":
             patched.append("--fit-margin")
+        elif arg == "--cache-reuse":
+            i += 1  # skip the value
+        elif arg == "--swa-full":
+            pass  # bare flag, just drop it
         else:
             patched.append(arg)
 
diff --git a/modules/loaders.py b/modules/loaders.py
index c90f2ebb..cb1f3d3b 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({
         'no_mmap',
         'mlock',
         'numa',
+        'ik',
         'parallel',
         'model_draft',
         'draft_max',
@@ -345,6 +346,7 @@ def list_model_elements():
         'spec_ngram_size_m',
         'spec_ngram_min_hits',
         'mmproj',
+        'ik',
     ]
 
 
diff --git a/modules/shared.py b/modules/shared.py
index c50736d7..13843f0c 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,7 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
-group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside <venv>/lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. Requires the ik_llama_cpp_binaries package to be installed.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 5b7621a7..16505afa 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -51,6 +51,9 @@ def create_ui():
 
                         with gr.Column():
                             shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
+                            if not shared.args.portable:
+                                shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.')
+
                             shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
                             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 56619627..100c99d1 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 620683cc..66fa4ac7 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b1f109b2..98dc8be6 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index a54476a9..e33264cf 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index be82c904..cd083f6d 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 188da380..67182225 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 4562b6d0..5f5b2f8d 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 04dcf25e..f5f7d6ee 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 4b8af78a..e51fc296 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 5b0eaf89..683f94c8 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 90b3234f..942d0877 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index ea72b4ec..ae784e00 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From be6fc0663ac1b7a60b7fde24afb38de2b0aba57b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 08:11:28 -0700
Subject: [PATCH 177/210] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 14 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 100c99d1..6e11dd2f 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 66fa4ac7..c964eff6 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 98dc8be6..b1dd6a4f 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index e33264cf..4d03d280 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index cd083f6d..9d41d069 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 77c254e6..052085cc 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 67182225..ff80b6c8 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5f5b2f8d..318044da 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index f5f7d6ee..1676bffb 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index e51fc296..27fc2da8 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 683f94c8..0bbdd30a 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 942d0877..c3ae3c57 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index e8457909..e38140ce 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index ae784e00..e646c04c 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 0466b6e2714a05c04eff0c929f15e4679f029e8d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 29 Mar 2026 15:52:36 -0700
Subject: [PATCH 178/210] ik_llama.cpp: Auto-enable Hadamard KV cache rotation
 with quantized cache

---
 modules/llama_cpp_server.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 5e2decfa..fa968be1 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -631,6 +631,12 @@ def _patch_cmd_for_ik(cmd):
       --cache-reuse        → (removed, unsupported)
       --swa-full           → (removed, unsupported)
     """
+    # Add Hadamard KV cache rotation when using quantized cache types.
+    # This significantly improves quantized cache quality (especially q4_0)
+    # and is a no-op for MLA models like DeepSeek.
+    if shared.args.cache_type in ("q8_0", "q4_0"):
+        cmd += ["-khad", "-vhad"]
+
     patched = []
     i = 0
     while i < len(cmd):

From 6382fbef8381bf60ff909b4fd76e7c1f4c063afc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 30 Mar 2026 17:44:19 -0700
Subject: [PATCH 179/210] Several small code simplifications

---
 download-model.py        |  25 +++---
 js/dark_theme.js         |  12 ++-
 js/global_scope_js.js    |  79 +++++++++---------
 js/main.js               | 171 +++++++++++++--------------------------
 js/save_files.js         |  18 ++---
 js/show_controls.js      |  21 ++---
 js/switch_tabs.js        |  24 ++----
 js/update_big_picture.js |   3 +-
 modules/extensions.py    |  22 +++--
 9 files changed, 140 insertions(+), 235 deletions(-)

diff --git a/download-model.py b/download-model.py
index 95d25e16..a31bbfc6 100644
--- a/download-model.py
+++ b/download-model.py
@@ -158,28 +158,21 @@ class ModelDownloader:
         # Also if GGUF and safetensors are available, download only safetensors
         if (has_pytorch or has_pt or has_gguf) and has_safetensors:
             has_gguf = False
-            for i in range(len(classifications) - 1, -1, -1):
-                if classifications[i] in ['pytorch', 'pt', 'gguf']:
-                    links.pop(i)
-                    file_sizes.pop(i)
+            keep = [i for i, c in enumerate(classifications) if c not in ['pytorch', 'pt', 'gguf']]
+            links = [links[i] for i in keep]
+            file_sizes = [file_sizes[i] for i in keep]
 
         # For GGUF, try to download only the Q4_K_M if no specific file is specified.
         if has_gguf and specific_file is None:
-            has_q4km = False
-            for i in range(len(classifications) - 1, -1, -1):
-                if 'q4_k_m' in links[i].lower():
-                    has_q4km = True
+            has_q4km = any('q4_k_m' in link.lower() for link in links)
 
             if has_q4km:
-                for i in range(len(classifications) - 1, -1, -1):
-                    if 'q4_k_m' not in links[i].lower():
-                        links.pop(i)
-                        file_sizes.pop(i)
+                keep = [i for i, link in enumerate(links) if 'q4_k_m' in link.lower()]
             else:
-                for i in range(len(classifications) - 1, -1, -1):
-                    if links[i].lower().endswith('.gguf'):
-                        links.pop(i)
-                        file_sizes.pop(i)
+                keep = [i for i, link in enumerate(links) if not link.lower().endswith('.gguf')]
+
+            links = [links[i] for i in keep]
+            file_sizes = [file_sizes[i] for i in keep]
 
         is_llamacpp = has_gguf and specific_file is not None
         return links, sha256, is_lora, is_llamacpp, file_sizes
diff --git a/js/dark_theme.js b/js/dark_theme.js
index 7136f5bf..9d7069e2 100644
--- a/js/dark_theme.js
+++ b/js/dark_theme.js
@@ -1,6 +1,6 @@
 function toggleDarkMode() {
   document.body.classList.toggle("dark");
-  var currentCSS = document.getElementById("highlight-css");
+  const currentCSS = document.getElementById("highlight-css");
   if (currentCSS.getAttribute("href") === "file/css/highlightjs/github-dark.min.css") {
     currentCSS.setAttribute("href", "file/css/highlightjs/github.min.css");
   } else {
@@ -9,12 +9,10 @@ function toggleDarkMode() {
 
   // Re-highlight all code blocks once stylesheet loads
   currentCSS.onload = function() {
-    const messageBodies = document.getElementById("chat").querySelectorAll(".message-body");
-    messageBodies.forEach((messageBody) => {
-      const codeBlocks = messageBody.querySelectorAll("pre code");
-      codeBlocks.forEach((codeBlock) => {
-        hljs.highlightElement(codeBlock);
-      });
+    // Clear data-highlighted so hljs will re-process with the new theme
+    document.querySelectorAll("#chat .message-body pre code[data-highlighted]").forEach((codeBlock) => {
+      delete codeBlock.dataset.highlighted;
     });
+    doSyntaxHighlighting();
   };
 }
diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 92f65622..20eeef66 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -1,11 +1,35 @@
+// -------------------------------------------------
+// Shared helpers
+// -------------------------------------------------
+
+function getProfilePictureUrl() {
+  return "/file/user_data/cache/pfp_character.png?time=" + Date.now();
+}
+
+const MESSAGE_SELECTOR = ".message, .user-message, .assistant-message";
+
+function getMessageElement(element) {
+  if (!element) return null;
+  return element.closest(MESSAGE_SELECTOR);
+}
+
+function isUserRole(messageElement) {
+  return messageElement.classList.contains("user-message") ||
+         messageElement.querySelector(".text-you") !== null ||
+         messageElement.querySelector(".circle-you") !== null;
+}
+
+// Trigger a synthetic 'input' event so Gradio picks up programmatic value changes
+function dispatchGradioInput(element) {
+  element.dispatchEvent(new Event("input", { bubbles: true }));
+}
+
 // -------------------------------------------------
 // Event handlers
 // -------------------------------------------------
 
 function copyToClipboard(element) {
-  if (!element) return;
-
-  const messageElement = element.closest(".message, .user-message, .assistant-message");
+  const messageElement = getMessageElement(element);
   if (!messageElement) return;
 
   const rawText = messageElement.getAttribute("data-raw");
@@ -48,9 +72,7 @@ function fallbackCopyToClipboard(text) {
 }
 
 function branchHere(element) {
-  if (!element) return;
-
-  const messageElement = element.closest(".message, .user-message, .assistant-message");
+  const messageElement = getMessageElement(element);
   if (!messageElement) return;
 
   const index = messageElement.getAttribute("data-index");
@@ -69,11 +91,7 @@ function branchHere(element) {
   }
 
   branchIndexInput.value = index;
-
-  // Trigger any 'change' or 'input' events Gradio might be listening for
-  const event = new Event("input", { bubbles: true });
-  branchIndexInput.dispatchEvent(event);
-
+  dispatchGradioInput(branchIndexInput);
   branchButton.click();
 }
 
@@ -82,9 +100,7 @@ function branchHere(element) {
 // -------------------------------------------------
 
 function editHere(buttonElement) {
-  if (!buttonElement) return;
-
-  const messageElement = buttonElement.closest(".message, .user-message, .assistant-message");
+  const messageElement = getMessageElement(buttonElement);
   if (!messageElement) return;
 
   const messageBody = messageElement.querySelector(".message-body");
@@ -97,12 +113,7 @@ function editHere(buttonElement) {
     return;
   }
 
-  // Determine role based on message element - handle different chat modes
-  const isUserMessage = messageElement.classList.contains("user-message") ||
-                       messageElement.querySelector(".text-you") !== null ||
-                       messageElement.querySelector(".circle-you") !== null;
-
-  startEditing(messageElement, messageBody, isUserMessage);
+  startEditing(messageElement, messageBody, isUserRole(messageElement));
 }
 
 function startEditing(messageElement, messageBody, isUserMessage) {
@@ -209,30 +220,22 @@ function submitMessageEdit(index, newText, isUserMessage) {
   editTextInput.value = newText;
   editRoleInput.value = isUserMessage ? "user" : "assistant";
 
-  editIndexInput.dispatchEvent(new Event("input", { bubbles: true }));
-  editTextInput.dispatchEvent(new Event("input", { bubbles: true }));
-  editRoleInput.dispatchEvent(new Event("input", { bubbles: true }));
+  dispatchGradioInput(editIndexInput);
+  dispatchGradioInput(editTextInput);
+  dispatchGradioInput(editRoleInput);
 
   editButton.click();
   return true;
 }
 
 function navigateVersion(element, direction) {
-  if (!element) return;
-
-  const messageElement = element.closest(".message, .user-message, .assistant-message");
+  const messageElement = getMessageElement(element);
   if (!messageElement) return;
 
   const index = messageElement.getAttribute("data-index");
   if (!index) return;
 
-  // Determine role based on message element classes
-  let role = "assistant"; // Default role
-  if (messageElement.classList.contains("user-message") ||
-      messageElement.querySelector(".text-you") ||
-      messageElement.querySelector(".circle-you")) {
-    role = "user";
-  }
+  const role = isUserRole(messageElement) ? "user" : "assistant";
 
   const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input");
   const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea");
@@ -248,11 +251,9 @@ function navigateVersion(element, direction) {
   directionInput.value = direction;
   roleInput.value = role;
 
-  // Trigger 'input' events for Gradio to pick up changes
-  const event = new Event("input", { bubbles: true });
-  indexInput.dispatchEvent(event);
-  directionInput.dispatchEvent(event);
-  roleInput.dispatchEvent(event);
+  dispatchGradioInput(indexInput);
+  dispatchGradioInput(directionInput);
+  dispatchGradioInput(roleInput);
 
   navigateButton.click();
 }
@@ -313,7 +314,7 @@ function handleMorphdomUpdate(data) {
 
 function applyMorphdomUpdate(data) {
   // Determine target element and use it as query scope
-  var target_element, target_html;
+  let target_element, target_html;
   if (data.last_message_only) {
     const childNodes = document.getElementsByClassName("messages")[0].childNodes;
     target_element = childNodes[childNodes.length - 1];
diff --git a/js/main.js b/js/main.js
index f05f93c6..cba4c903 100644
--- a/js/main.js
+++ b/js/main.js
@@ -4,8 +4,9 @@
 
 // Sync highlight.js theme with the actual Gradio theme
 var defined_hljs_css = document.body.classList.contains("dark") ? "file/css/highlightjs/github-dark.min.css" : "file/css/highlightjs/github.min.css";
-if (document.getElementById("highlight-css").getAttribute("href") !== defined_hljs_css) {
-  document.getElementById("highlight-css").setAttribute("href", defined_hljs_css);
+var hljsCssElement = document.getElementById("highlight-css");
+if (hljsCssElement.getAttribute("href") !== defined_hljs_css) {
+  hljsCssElement.setAttribute("href", defined_hljs_css);
 }
 
 let main_parent = document.getElementById("chat-tab").parentNode;
@@ -49,21 +50,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event)
 //------------------------------------------------
 
 // --- Helper functions --- //
-function isModifiedKeyboardEvent() {
-  return (event instanceof KeyboardEvent &&
-    event.shiftKey ||
-    event.ctrlKey ||
-    event.altKey ||
-    event.metaKey);
+function isModifiedKeyboardEvent(event) {
+  return event instanceof KeyboardEvent &&
+    (event.shiftKey || event.ctrlKey || event.altKey || event.metaKey);
 }
 
-function isFocusedOnEditableTextbox() {
+function isFocusedOnEditableTextbox(event) {
   if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") {
     return !!event.target.value;
   }
+  return false;
 }
 
-let previousTabId = "chat-tab-button";
 document.addEventListener("keydown", function(event) {
   // Stop generation on Esc pressed
   if (event.key === "Escape") {
@@ -117,14 +115,14 @@ document.addEventListener("keydown", function(event) {
   }
 
   // --- Simple version navigation --- //
-  if (!isFocusedOnEditableTextbox()) {
+  if (!isFocusedOnEditableTextbox(event)) {
     // Version navigation on Arrow keys (horizontal)
-    if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") {
+    if (!isModifiedKeyboardEvent(event) && event.key === "ArrowLeft") {
       event.preventDefault();
       navigateLastAssistantMessage("left");
     }
 
-    else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") {
+    else if (!isModifiedKeyboardEvent(event) && event.key === "ArrowRight") {
       event.preventDefault();
       if (!navigateLastAssistantMessage("right")) {
         // If can't navigate right (last version), regenerate
@@ -159,9 +157,8 @@ targetElement.addEventListener("scroll", function() {
   let diff = targetElement.scrollHeight - targetElement.clientHeight;
   let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0;
 
-  // Add scrolling class to disable hover effects
   if (window.isScrolled || !isAtBottomNow) {
-    targetElement.classList.add("scrolling");
+    targetElement.classList.add("scrolling"); // Disables hover effects during scroll
   }
 
   if(isAtBottomNow) {
@@ -202,12 +199,8 @@ const observer = new MutationObserver(function() {
 });
 
 // Only watch for attribute changes on targetElement (e.g. _generating class)
-const config = {
-  attributes: true
-};
-
 // Start observing the target element
-observer.observe(targetElement, config);
+observer.observe(targetElement, { attributes: true });
 
 //------------------------------------------------
 // Handle syntax highlighting / LaTeX
@@ -228,7 +221,7 @@ window.doSyntaxHighlighting = function() {
   if (messageBodies.length > 0) {
     let hasSeenVisible = false;
 
-    // Go from last message to first
+    // Go from last message to first so we can early-exit once past visible area
     for (let i = messageBodies.length - 1; i >= 0; i--) {
       const messageBody = messageBodies[i];
 
@@ -243,8 +236,8 @@ window.doSyntaxHighlighting = function() {
           codeBlock.classList.add("pretty_scrollbar");
         });
 
-        // Only render math in visible elements
         const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
+        // Only render math in individually visible containers (the outer check is on the message body)
         mathContainers.forEach(container => {
           if (isElementVisibleOnScreen(container)) {
             renderMathInElement(container, {
@@ -271,7 +264,7 @@ const doSyntaxHighlighting = window.doSyntaxHighlighting;
 // Add some scrollbars
 //------------------------------------------------
 const scrollbarElements = document.querySelectorAll(".add_scrollbar textarea, .add_scrollbar .drag-drop-list");
-for(i = 0; i < scrollbarElements.length; i++) {
+for(let i = 0; i < scrollbarElements.length; i++) {
   scrollbarElements[i].classList.remove("scroll-hide");
   scrollbarElements[i].classList.add("pretty_scrollbar");
   scrollbarElements[i].style.resize = "none";
@@ -298,13 +291,13 @@ if (toolsInfo) {
 // Remove some backgrounds
 //------------------------------------------------
 const noBackgroundelements = document.querySelectorAll(".no-background");
-for(i = 0; i < noBackgroundelements.length; i++) {
+for(let i = 0; i < noBackgroundelements.length; i++) {
   noBackgroundelements[i].parentNode.style.border = "none";
   noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center";
 }
 
 const slimDropdownElements = document.querySelectorAll(".slim-dropdown");
-for (i = 0; i < slimDropdownElements.length; i++) {
+for (let i = 0; i < slimDropdownElements.length; i++) {
   const parentNode = slimDropdownElements[i].parentNode;
   parentNode.style.background = "transparent";
   parentNode.style.border = "0";
@@ -374,49 +367,43 @@ button.addEventListener("click", function () {
   }
 });
 
-// Add event listener for mouseleave on the button
-button.addEventListener("mouseleave", function () {
-  // Delay to prevent menu hiding when the mouse leaves the button into the menu
+// Delay to prevent menu hiding when the mouse leaves the button or menu
+function delayedHideMenu() {
   setTimeout(function () {
     if (!isMouseOverButtonOrMenu()) {
       hideMenu();
     }
   }, 100);
-});
+}
 
+// Add event listener for mouseleave on the button
+button.addEventListener("mouseleave", delayedHideMenu);
 // Add event listener for mouseleave on the menu
-menu.addEventListener("mouseleave", function () {
-  // Delay to prevent menu hide when the mouse leaves the menu into the button
-  setTimeout(function () {
-    if (!isMouseOverButtonOrMenu()) {
-      hideMenu();
-    }
-  }, 100);
-});
+menu.addEventListener("mouseleave", delayedHideMenu);
 
 // Add event listener for click anywhere in the document
 document.addEventListener("click", function (event) {
-  const target = event.target;
-
   // Check if the click is outside the button/menu and the menu is visible
   if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") {
     hideMenu();
   }
 
-  if (event.target.classList.contains("pfp_character")) {
+  const target = event.target;
+
+  if (target.classList.contains("pfp_character")) {
     toggleBigPicture();
   }
 
   // Handle sidebar clicks on mobile
   if (isMobile()) {
-  // Check if the click did NOT originate from any of the specified toggle buttons or elements
+    // Check if the click did NOT originate from any of the specified toggle buttons or elements
     if (
       target.closest("#navigation-toggle") !== navigationToggle &&
-    target.closest("#past-chats-toggle") !== pastChatsToggle &&
-    target.closest("#chat-controls-toggle") !== chatControlsToggle &&
-    target.closest(".header_bar") !== headerBar &&
-    target.closest("#past-chats-row") !== pastChatsRow &&
-    target.closest("#chat-controls") !== chatControlsRow
+      target.closest("#past-chats-toggle") !== pastChatsToggle &&
+      target.closest("#chat-controls-toggle") !== chatControlsToggle &&
+      target.closest(".header_bar") !== headerBar &&
+      target.closest("#past-chats-row") !== pastChatsRow &&
+      target.closest("#chat-controls") !== chatControlsRow
     ) {
       handleIndividualSidebarClose(event);
     }
@@ -433,27 +420,19 @@ document.getElementById("chat-input-row").classList.add("chat-input-positioned")
 //------------------------------------------------
 const chatTextArea = document.getElementById("chat-input").querySelector("textarea");
 
-function respondToChatInputVisibility(element, callback) {
-  var options = {
-    root: document.documentElement,
-  };
-
-  var observer = new IntersectionObserver((entries, observer) => {
+function focusOnVisible(element) {
+  var observer = new IntersectionObserver((entries) => {
     entries.forEach(entry => {
-      callback(entry.intersectionRatio > 0);
+      if (entry.intersectionRatio > 0) {
+        element.focus();
+      }
     });
-  }, options);
+  }, { root: document.documentElement });
 
   observer.observe(element);
 }
 
-function handleChatInputVisibilityChange(isVisible) {
-  if (isVisible) {
-    chatTextArea.focus();
-  }
-}
-
-respondToChatInputVisibility(chatTextArea, handleChatInputVisibilityChange);
+focusOnVisible(chatTextArea);
 
 //------------------------------------------------
 // Show enlarged character picture when the profile
@@ -463,8 +442,7 @@ let bigPictureVisible = false;
 
 function addBigPicture() {
   var imgElement = document.createElement("img");
-  var timestamp = new Date().getTime();
-  imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
+  imgElement.src = getProfilePictureUrl();
   imgElement.classList.add("bigProfilePicture");
   imgElement.addEventListener("load", function () {
     this.style.visibility = "visible";
@@ -478,9 +456,8 @@ function addBigPicture() {
 }
 
 function deleteBigPicture() {
-  var bigProfilePictures = document.querySelectorAll(".bigProfilePicture");
-  bigProfilePictures.forEach(function (element) {
-    element.parentNode.removeChild(element);
+  document.querySelectorAll(".bigProfilePicture").forEach(function (element) {
+    element.remove();
   });
 }
 
@@ -494,44 +471,11 @@ function toggleBigPicture() {
   }
 }
 
-//------------------------------------------------
-// Handle the chat input box growth
-//------------------------------------------------
-
-// Cache DOM elements
-const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
-const chatInput = document.querySelector("#chat-input textarea");
-
-// Variables to store current dimensions
-let currentChatInputHeight = chatInput.clientHeight;
-
 //------------------------------------------------
 // Focus on the rename text area when it becomes visible
 //------------------------------------------------
 const renameTextArea = document.getElementById("rename-row").querySelector("textarea");
-
-function respondToRenameVisibility(element, callback) {
-  var options = {
-    root: document.documentElement,
-  };
-
-  var observer = new IntersectionObserver((entries, observer) => {
-    entries.forEach(entry => {
-      callback(entry.intersectionRatio > 0);
-    });
-  }, options);
-
-  observer.observe(element);
-}
-
-
-function handleVisibilityChange(isVisible) {
-  if (isVisible) {
-    renameTextArea.focus();
-  }
-}
-
-respondToRenameVisibility(renameTextArea, handleVisibilityChange);
+focusOnVisible(renameTextArea);
 
 //------------------------------------------------
 // Adjust the chat tab margin if no extension UI
@@ -737,21 +681,21 @@ function handleIndividualSidebarClose(event) {
 
   // Close navigation bar if click is outside and it is open
   if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) {
-    toggleSidebar(headerBar, navigationToggle, true);
+    toggleSidebar(headerBar, navigationToggle);
   }
 
   // Close past chats row if click is outside and it is open
   if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) {
-    toggleSidebar(pastChatsRow, pastChatsToggle, true);
+    toggleSidebar(pastChatsRow, pastChatsToggle);
   }
 
   // Close chat controls row if click is outside and it is open
   if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) {
-    toggleSidebar(chatControlsRow, chatControlsToggle, true);
+    toggleSidebar(chatControlsRow, chatControlsToggle);
   }
 }
 
-function toggleSidebar(sidebar, toggle, forceClose = false) {
+function toggleSidebar(sidebar, toggle) {
   const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden");
   const shouldClose = !isCurrentlyHidden;
 
@@ -776,11 +720,6 @@ function toggleSidebar(sidebar, toggle, forceClose = false) {
     toggle.classList.toggle("chat-controls-open", !shouldClose);
     toggle.innerHTML = shouldClose ? leftArrowSVG : rightArrowSVG;
   }
-
-  // Mobile handling
-  if (isMobile()) {
-    sidebar.classList.toggle("sidebar-shown", !shouldClose);
-  }
 }
 
 // Function to check if the device is mobile
@@ -840,17 +779,17 @@ pastChatsToggle.addEventListener("click", () => {
   const isCurrentlyOpen = !pastChatsRow.classList.contains("sidebar-hidden");
   toggleSidebar(pastChatsRow, pastChatsToggle);
 
-  // On desktop, open/close both sidebars at the same time
+  // On desktop, sync both sidebars together
   if (!isMobile()) {
     if (isCurrentlyOpen) {
       // If we just closed the left sidebar, also close the right sidebar
       if (!chatControlsRow.classList.contains("sidebar-hidden")) {
-        toggleSidebar(chatControlsRow, chatControlsToggle, true);
+        toggleSidebar(chatControlsRow, chatControlsToggle);
       }
     } else {
       // If we just opened the left sidebar, also open the right sidebar
       if (chatControlsRow.classList.contains("sidebar-hidden")) {
-        toggleSidebar(chatControlsRow, chatControlsToggle, false);
+        toggleSidebar(chatControlsRow, chatControlsToggle);
       }
     }
   }
@@ -860,17 +799,17 @@ chatControlsToggle.addEventListener("click", () => {
   const isCurrentlyOpen = !chatControlsRow.classList.contains("sidebar-hidden");
   toggleSidebar(chatControlsRow, chatControlsToggle);
 
-  // On desktop, open/close both sidebars at the same time
+  // On desktop, sync both sidebars together
   if (!isMobile()) {
     if (isCurrentlyOpen) {
       // If we just closed the right sidebar, also close the left sidebar
       if (!pastChatsRow.classList.contains("sidebar-hidden")) {
-        toggleSidebar(pastChatsRow, pastChatsToggle, true);
+        toggleSidebar(pastChatsRow, pastChatsToggle);
       }
     } else {
       // If we just opened the right sidebar, also open the left sidebar
       if (pastChatsRow.classList.contains("sidebar-hidden")) {
-        toggleSidebar(pastChatsRow, pastChatsToggle, false);
+        toggleSidebar(pastChatsRow, pastChatsToggle);
       }
     }
   }
@@ -890,7 +829,7 @@ if (isMobile()) {
   const textarea = document.querySelector("#chat-input textarea");
 
   if (textarea) {
-    // Simulate adding and removing a newline
+    // Force textarea height recalculation by simulating content change
     textarea.value += "\n";
     textarea.dispatchEvent(new Event("input", { bubbles: true }));
     textarea.value = textarea.value.slice(0, -1);
diff --git a/js/save_files.js b/js/save_files.js
index bdb0e334..c3cbf9ff 100644
--- a/js/save_files.js
+++ b/js/save_files.js
@@ -1,10 +1,9 @@
 // Functions for downloading JSON files
 function getCurrentTimestamp() {
   const now = new Date();
-  const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds
+  const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert minutes to milliseconds
   const localTime = new Date(now.getTime() - timezoneOffset);
-  const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
-  return formattedTimestamp;
+  return localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
 }
 
 function saveFile(contents, filename) {
@@ -18,23 +17,18 @@ function saveFile(contents, filename) {
 }
 
 function saveHistory(history, character, mode) {
-  let path = null;
+  let path;
 
   if (["chat", "chat-instruct"].includes(mode) && character && character.trim() !== "") {
     path = `history_${character}_${getCurrentTimestamp()}.json`;
   } else {
-    try {
-      path = `history_${mode}_${getCurrentTimestamp()}.json`;
-    } catch (error) {
-      path = `history_${getCurrentTimestamp()}.json`;
-    }
+    path = `history_${mode || "unknown"}_${getCurrentTimestamp()}.json`;
   }
+
   saveFile(history, path);
 }
 
 function saveSession(session) {
-  let path = null;
-
-  path = `session_${getCurrentTimestamp()}.json`;
+  const path = `session_${getCurrentTimestamp()}.json`;
   saveFile(session, path);
 }
diff --git a/js/show_controls.js b/js/show_controls.js
index ff513395..d5642dc4 100644
--- a/js/show_controls.js
+++ b/js/show_controls.js
@@ -1,13 +1,11 @@
-const chatParent = document.querySelector(".chat-parent");
-
 function toggle_controls(value) {
+  const navToggle = document.getElementById("navigation-toggle");
+  const pastChatsToggle = document.getElementById("past-chats-toggle");
   const extensions = document.querySelector("#extensions");
+  const galleryExtension = document.getElementById("gallery-extension");
 
   if (value) {
     // SHOW MODE: Click toggles to show hidden sidebars
-    const navToggle = document.getElementById("navigation-toggle");
-    const pastChatsToggle = document.getElementById("past-chats-toggle");
-
     if (navToggle && document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
       navToggle.click();
     }
@@ -19,17 +17,11 @@ function toggle_controls(value) {
     if (extensions) {
       extensions.style.display = "inherit";
     }
-
-    let gallery_element = document.getElementById("gallery-extension");
-    if (gallery_element) {
-      gallery_element.style.display = "block";
+    if (galleryExtension) {
+      galleryExtension.style.display = "block";
     }
-
   } else {
     // HIDE MODE: Click toggles to hide visible sidebars
-    const navToggle = document.getElementById("navigation-toggle");
-    const pastChatsToggle = document.getElementById("past-chats-toggle");
-
     if (navToggle && !document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
       navToggle.click();
     }
@@ -41,5 +33,8 @@ function toggle_controls(value) {
     if (extensions) {
       extensions.style.display = "none";
     }
+    if (galleryExtension) {
+      galleryExtension.style.display = "none";
+    }
   }
 }
diff --git a/js/switch_tabs.js b/js/switch_tabs.js
index 36e5736b..a1b44ef3 100644
--- a/js/switch_tabs.js
+++ b/js/switch_tabs.js
@@ -2,17 +2,9 @@ function scrollToTop() {
   window.scrollTo({ top: 0 });
 }
 
-function findButtonsByText(buttonText) {
-  const buttons = document.getElementsByTagName("button");
-  const matchingButtons = [];
-
-  for (let i = 0; i < buttons.length; i++) {
-    if (buttons[i].textContent.trim() === buttonText) {
-      matchingButtons.push(buttons[i]);
-    }
-  }
-
-  return matchingButtons;
+function findButtonsByText(buttonText, container = document) {
+  return Array.from(container.getElementsByTagName("button"))
+    .filter(btn => btn.textContent.trim() === buttonText);
 }
 
 function switch_to_chat() {
@@ -39,13 +31,9 @@ function switch_to_character() {
 
 function switch_to_image_ai_generate() {
   const container = document.querySelector("#image-ai-tab");
-  const buttons = container.getElementsByTagName("button");
-
-  for (let i = 0; i < buttons.length; i++) {
-    if (buttons[i].textContent.trim() === "Generate") {
-      buttons[i].click();
-      break;
-    }
+  const generateBtn = findButtonsByText("Generate", container)[0];
+  if (generateBtn) {
+    generateBtn.click();
   }
 
   scrollToTop();
diff --git a/js/update_big_picture.js b/js/update_big_picture.js
index ec51d63b..8f638c99 100644
--- a/js/update_big_picture.js
+++ b/js/update_big_picture.js
@@ -1,7 +1,6 @@
 function updateBigPicture() {
   var existingElement = document.querySelector(".bigProfilePicture");
   if (existingElement) {
-    var timestamp = new Date().getTime();
-    existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
+    existingElement.src = getProfilePictureUrl();
   }
 }
diff --git a/modules/extensions.py b/modules/extensions.py
index 09db9f40..afe847f0 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -191,21 +191,19 @@ def _apply_custom_generate_reply():
 
 
 def _apply_custom_css():
-    all_css = ''
-    for extension, _ in iterator():
-        if hasattr(extension, 'custom_css'):
-            all_css += getattr(extension, 'custom_css')()
-
-    return all_css
+    return ''.join(
+        getattr(extension, 'custom_css')()
+        for extension, _ in iterator()
+        if hasattr(extension, 'custom_css')
+    )
 
 
 def _apply_custom_js():
-    all_js = ''
-    for extension, _ in iterator():
-        if hasattr(extension, 'custom_js'):
-            all_js += getattr(extension, 'custom_js')()
-
-    return all_js
+    return ''.join(
+        getattr(extension, 'custom_js')()
+        for extension, _ in iterator()
+        if hasattr(extension, 'custom_js')
+    )
 
 
 def create_extensions_block():

From 71c1a52afe54ab599ab5849ae80f1d5a3a72fb5a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 30 Mar 2026 20:49:38 -0700
Subject: [PATCH 180/210] API: Implement echo + logprobs for /v1/completions
 endpoint

---
 modules/api/completions.py  | 299 ++++++++++++++++++++++++++++++------
 modules/exllamav3.py        |  26 +++-
 modules/llama_cpp_server.py |  39 ++++-
 3 files changed, 309 insertions(+), 55 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 8948bb86..587ad6ea 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -39,6 +39,129 @@ def load_chat_template_file(filepath):
     return text
 
 
+def _first_token_display_str(token_id, prompt, tokenizer):
+    """Return the display string for the first prompt token.
+
+    Returns empty string for BOS or tokens that don't appear at the start
+    of the prompt text, so they don't shift text_offset for subsequent tokens.
+    """
+    token_id = int(token_id)
+    bos_id = getattr(tokenizer, 'bos_token_id', None)
+    if bos_id is not None and token_id == bos_id:
+        return ""
+
+    import torch
+    tok = tokenizer.decode(torch.tensor([token_id]))
+    if not prompt.startswith(tok):
+        return ""
+
+    return tok
+
+
+def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
+    """Compute logprob entries for prompt tokens via a forward pass.
+
+    Returns a list of logprob entries in the standard format.
+    The first token gets a null entry (no conditioning context).
+
+    Supported for native ExLlamav3 and HF-compatible loaders (Transformers,
+    ExLlamav3_HF, etc.) via a single forward pass, and for llama.cpp via the
+    server's prompt_logprobs parameter. Returns [] for unsupported loaders.
+    """
+    if input_ids is None:
+        input_ids = encode(prompt)  # (1, seq_len) tensor or array
+
+    token_ids = input_ids[0]
+    n_tokens = len(token_ids)
+
+    if n_tokens == 0:
+        return []
+
+    loader = shared.args.loader
+    model = shared.model
+
+    if loader == 'llama.cpp':
+        return model.get_prompt_logprob_entries(token_ids, max(logprobs_count, 1), prompt=prompt)
+
+    first_token_str = _first_token_display_str(token_ids[0], prompt, shared.tokenizer)
+
+    if n_tokens <= 1:
+        return [{"token": first_token_str, "null_logprob": True}]
+
+    import torch
+
+    if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
+        # Native ExLlamav3: call the underlying Model.forward() directly
+        input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+        with torch.no_grad():
+            logits = model.model.forward(
+                input_ids=input_ids_tensor,
+                params={
+                    "attn_mode": "flash_attn",
+                    "cache": model.cache,
+                    "past_len": 0,
+                    "batch_shape": (1, model.max_tokens),
+                }
+            ).float().cpu()
+
+    elif hasattr(model, 'forward'):
+        # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
+        input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+        if hasattr(model, 'device'):
+            input_ids_tensor = input_ids_tensor.to(model.device)
+        with torch.no_grad():
+            # Pass labels to ensure logits are returned for ALL positions,
+            # not just the last token (some HF wrappers like ExLlamav3_HF
+            # only compute the last-token logits when labels are absent).
+            outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor)
+            logits = outputs.logits.float().cpu()
+
+    else:
+        return []
+
+    entries = [{"token": first_token_str, "null_logprob": True}]
+
+    # Batch logsumexp and topk as single operations across all positions
+    # to avoid per-position kernel launch overhead.
+    prompt_logits = logits[0, :n_tokens - 1]  # positions 0..n-2 predict tokens 1..n-1
+    k = min(logprobs_count, prompt_logits.shape[-1])
+    all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1)
+    all_lse = torch.logsumexp(prompt_logits, dim=-1)
+    all_top_log_probs = all_top_values - all_lse.unsqueeze(-1)
+
+    # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls
+    unique_ids = set(int(tid) for tid in token_ids[1:])
+    unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist())
+
+    decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids}
+
+    for i in range(1, n_tokens):
+        token_id = int(token_ids[i])
+        idx = i - 1
+        top_log_probs = all_top_log_probs[idx]
+        top_ids = all_top_indices[idx].tolist()
+        actual_token_str = decoded_strs[token_id]
+
+        # Build the top list with the actual prompt token guaranteed at front
+        if token_id in top_ids:
+            actual_lp = top_log_probs[top_ids.index(token_id)].item()
+            alternatives = [
+                {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+                for j in range(k) if top_ids[j] != token_id
+            ]
+        else:
+            actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
+            alternatives = [
+                {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+                for j in range(k - 1)  # drop lowest to make room
+            ]
+
+        entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives}
+        entries.append(entry)
+
+    return entries
+
+
 def _get_raw_logprob_entries(offset=0):
     """Get raw logprob entries from llama.cpp/ExLlamav3 backend, starting from offset.
 
@@ -65,6 +188,21 @@ def _parse_entry_top(entry):
     return entry.get('top_logprobs', entry.get('top_probs', []))
 
 
+def _extract_sampled_token(entry, top):
+    """Get the actually sampled token and its logprob from a logprob entry.
+
+    Uses the entry-level token/logprob when available (the actually sampled
+    token), falling back to top[0] (highest-probability alternative) which
+    may differ with non-greedy sampling.
+    """
+    if 'token' in entry:
+        return entry['token'], entry.get('logprob', entry.get('prob', 0))
+
+    token_str = top[0].get('token', '')
+    token_logprob = top[0].get('logprob', top[0].get('prob', 0))
+    return token_str, token_logprob
+
+
 def format_chat_logprobs(entries):
     """Format logprob entries into OpenAI chat completions logprobs format.
 
@@ -79,9 +217,7 @@ def format_chat_logprobs(entries):
         if not top:
             continue
 
-        chosen = top[0]
-        token_str = chosen.get('token', '')
-        token_logprob = chosen.get('logprob', chosen.get('prob', 0))
+        token_str, token_logprob = _extract_sampled_token(entry, top)
 
         top_list = []
         for item in top:
@@ -118,13 +254,21 @@ def format_completion_logprobs(entries):
     offset = 0
 
     for entry in entries:
+        # Handle null logprob entries (first prompt token with echo)
+        if entry.get("null_logprob"):
+            token_str = entry.get("token", "")
+            tokens.append(token_str)
+            token_logprobs.append(None)
+            top_logprobs.append(None)
+            text_offset.append(offset)
+            offset += len(token_str)
+            continue
+
         top = _parse_entry_top(entry)
         if not top:
             continue
 
-        chosen = top[0]
-        token_str = chosen.get('token', '')
-        token_logprob = chosen.get('logprob', chosen.get('prob', 0))
+        token_str, token_logprob = _extract_sampled_token(entry, top)
 
         tokens.append(token_str)
         token_logprobs.append(token_logprob)
@@ -407,7 +551,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     })
 
     max_tokens = generate_params['max_new_tokens']
-    if max_tokens in [None, 0]:
+    if max_tokens is not None and max_tokens <= 0:
+        raise InvalidRequestError(message="max_tokens must be greater than 0.", param="max_tokens")
+
+    if max_tokens is None:
         generate_params['max_new_tokens'] = 512
         generate_params['auto_max_new_tokens'] = True
 
@@ -652,6 +799,15 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
     # common params
     generate_params = process_parameters(body, is_legacy=is_legacy)
     max_tokens = generate_params['max_new_tokens']
+    if max_tokens is None:
+        generate_params['max_new_tokens'] = 512
+        generate_params['auto_max_new_tokens'] = True
+        max_tokens = 512
+    elif max_tokens < 0:
+        raise InvalidRequestError(message="max_tokens must be greater than or equal to 0.", param="max_tokens")
+    elif max_tokens == 0 and body.get('logprobs') is None:
+        raise InvalidRequestError(message="max_tokens is 0 but no logprobs parameter was specified.", param="max_tokens")
+
     generate_params['stream'] = stream
     if stop_event is not None:
         generate_params['stop_event'] = stop_event
@@ -700,9 +856,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                         prompt = decode(prompt)[0]
 
             prefix = prompt if echo else ''
-            token_count = len(encode(prompt)[0])
+            prompt_input_ids = encode(prompt)
+            token_count = len(prompt_input_ids[0])
             total_prompt_token_count += token_count
 
+            # Compute prompt logprobs once per prompt (shared across n_completions)
+            logprobs_val = body.get('logprobs', None)
+            if echo and logprobs_val is not None and logprobs_val >= 0:
+                prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
+            else:
+                prompt_entries = None
+
             original_seed = generate_params.get('seed', -1)
             for _n in range(n_completions):
                 # Increment seed for each completion to ensure diversity (matches llama.cpp native behavior)
@@ -713,29 +877,41 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                     logprob_proc.token_alternatives_history.clear()
 
                 # generate reply #######################################
-                debug_msg({'prompt': prompt, 'generate_params': generate_params})
-                generator = generate_reply(prompt, generate_params, is_chat=False)
-                answer = ''
-
-                for a in generator:
-                    answer = a
-
-                completion_token_count = len(encode(answer)[0])
-                total_completion_token_count += completion_token_count
-                stop_reason = "stop"
-                if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
-                    stop_reason = "length"
-
-                if logprob_proc:
-                    all_entries = []
-                    for alt in logprob_proc.token_alternatives_history:
-                        all_entries.extend(_dict_to_logprob_entries(alt))
-                    completion_logprobs = format_completion_logprobs(all_entries)
-                elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
-                    raw = getattr(shared.model, 'last_completion_probabilities', None)
-                    completion_logprobs = format_completion_logprobs(raw)
+                if max_tokens == 0:
+                    answer = ''
+                    completion_token_count = 0
+                    stop_reason = "stop"
                 else:
-                    completion_logprobs = None
+                    debug_msg({'prompt': prompt, 'generate_params': generate_params})
+                    generator = generate_reply(prompt, generate_params, is_chat=False)
+                    answer = ''
+
+                    for a in generator:
+                        answer = a
+
+                    completion_token_count = len(encode(answer)[0])
+                    stop_reason = "stop"
+                    if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
+                        stop_reason = "length"
+
+                total_completion_token_count += completion_token_count
+
+                if max_tokens == 0:
+                    all_entries = []
+                else:
+                    if logprob_proc:
+                        all_entries = []
+                        for alt in logprob_proc.token_alternatives_history:
+                            all_entries.extend(_dict_to_logprob_entries(alt))
+                    elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
+                        all_entries = getattr(shared.model, 'last_completion_probabilities', None) or []
+                    else:
+                        all_entries = []
+
+                if prompt_entries:
+                    all_entries = prompt_entries + all_entries
+
+                completion_logprobs = format_completion_logprobs(all_entries) if all_entries else None
 
                 respi = {
                     "index": choice_index,
@@ -775,7 +951,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                 raise InvalidRequestError(message="API Batched generation not yet supported.", param=prompt_str)
 
         prefix = prompt if echo else ''
-        token_count = len(encode(prompt)[0])
+        prompt_input_ids = encode(prompt)
+        token_count = len(prompt_input_ids[0])
 
         # Check if usage should be included in streaming chunks per OpenAI spec
         stream_options = body.get('stream_options')
@@ -808,37 +985,57 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
 
             return chunk
 
+        logprobs_val = body.get('logprobs', None)
+        if echo and logprobs_val is not None and logprobs_val >= 0:
+            prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
+            prompt_logprobs_formatted = format_completion_logprobs(prompt_entries) if prompt_entries else None
+        else:
+            prompt_logprobs_formatted = None
+
+        # Clear stale logprobs from any previous request before building the
+        # first chunk, so text_streaming_chunk doesn't pick up old data.
+        if hasattr(shared.model, 'last_completion_probabilities'):
+            shared.model.last_completion_probabilities = []
+        cmpl_logprobs_offset[0] = 0
+
         chunk = text_streaming_chunk(prefix)
+        if prompt_logprobs_formatted is not None:
+            chunk[resp_list][0]["logprobs"] = prompt_logprobs_formatted
         if include_usage:
             chunk['usage'] = None
         yield chunk
 
         # generate reply #######################################
-        debug_msg({'prompt': prompt, 'generate_params': generate_params})
-        generator = generate_reply(prompt, generate_params, is_chat=False)
-        answer = ''
-        seen_content = ''
-        completion_token_count = 0
+        if max_tokens == 0:
+            answer = ''
+            completion_token_count = 0
+            stop_reason = "stop"
+        else:
+            debug_msg({'prompt': prompt, 'generate_params': generate_params})
+            generator = generate_reply(prompt, generate_params, is_chat=False)
+            answer = ''
+            seen_content = ''
+            completion_token_count = 0
 
-        for a in generator:
-            answer = a
+            for a in generator:
+                answer = a
 
-            len_seen = len(seen_content)
-            new_content = answer[len_seen:]
+                len_seen = len(seen_content)
+                new_content = answer[len_seen:]
 
-            if not new_content or chr(0xfffd) in new_content:  # partial unicode character, don't send it yet.
-                continue
+                if not new_content or chr(0xfffd) in new_content:  # partial unicode character, don't send it yet.
+                    continue
 
-            seen_content = answer
-            chunk = text_streaming_chunk(new_content)
-            if include_usage:
-                chunk['usage'] = None
-            yield chunk
+                seen_content = answer
+                chunk = text_streaming_chunk(new_content)
+                if include_usage:
+                    chunk['usage'] = None
+                yield chunk
 
-        completion_token_count = len(encode(answer)[0])
-        stop_reason = "stop"
-        if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
-            stop_reason = "length"
+            completion_token_count = len(encode(answer)[0])
+            stop_reason = "stop"
+            if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
+                stop_reason = "length"
 
         chunk = text_streaming_chunk(suffix)
         chunk[resp_list][0]["finish_reason"] = stop_reason
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index f873503a..3782a693 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -489,15 +489,35 @@ class Exllamav3Model:
             return
 
         id_to_piece = self.tokenizer.get_id_to_piece_list(True)
+        sampled_ids = result.get("token_ids")    # (batch, seq_len) - actually sampled tokens
+        sampled_probs = result.get("token_probs")  # (batch, seq_len) - their probabilities
+
+        def _piece(tid):
+            s = id_to_piece[tid] if tid < len(id_to_piece) else f"<{tid}>"
+            return s.replace('\u2581', ' ')
+
+        def _logprob(prob):
+            return math.log(prob) if prob > 0 else float("-inf")
+
         # top_k_tokens shape: (batch, seq_len, k), top_k_probs same
         for seq_idx in range(top_k_tokens.shape[1]):
             entry = {"top_logprobs": []}
             for k_idx in range(top_k_tokens.shape[2]):
                 token_id = top_k_tokens[0, seq_idx, k_idx].item()
                 prob = top_k_probs[0, seq_idx, k_idx].item()
-                token_str = id_to_piece[token_id] if token_id < len(id_to_piece) else f"<{token_id}>"
-                logprob = math.log(prob) if prob > 0 else float("-inf")
-                entry["top_logprobs"].append({"token": token_str, "logprob": logprob})
+                entry["top_logprobs"].append({"token": _piece(token_id), "logprob": _logprob(prob)})
+
+            # Record the actually sampled token at the entry level so
+            # format_completion_logprobs uses it instead of top_logprobs[0]
+            # (they differ with non-greedy sampling).
+            if sampled_ids is not None:
+                sid = sampled_ids[0, seq_idx].item()
+                entry["token"] = _piece(sid)
+                if sampled_probs is not None:
+                    entry["logprob"] = _logprob(sampled_probs[0, seq_idx].item())
+                else:
+                    entry["logprob"] = None
+
             self.last_completion_probabilities.append(entry)
 
     def generate(self, prompt, state):
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index fa968be1..34080466 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -310,8 +310,45 @@ class LlamaServer:
         else:
             raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
 
+    def get_prompt_logprob_entries(self, token_ids, n_probs=5, prompt=""):
+        """Get logprob entries for prompt tokens via a single n_predict=0 request.
+
+        Requires llama.cpp server with prompt_logprobs support.
+        Returns entries in the standard format for format_completion_logprobs().
+        """
+        token_ids_list = token_ids.tolist() if hasattr(token_ids, 'tolist') else list(token_ids)
+
+        url = f"http://127.0.0.1:{self.port}/completion"
+        payload = {
+            "prompt": token_ids_list,
+            "n_predict": 0,
+            "n_probs": n_probs,
+            "prompt_logprobs": True,
+            "stream": False,
+            "cache_prompt": False,
+        }
+
+        response = self.session.post(url, json=payload)
+        result = response.json()
+
+        prompt_probs = result.get("prompt_probabilities", [])
+        if not prompt_probs:
+            return []
+
+        # Null first token (no conditioning context); use empty string for BOS
+        # or tokens that don't appear at the start of the prompt text.
+        first_token_str = self.decode([token_ids_list[0]])
+        if self.bos_token and first_token_str == self.bos_token:
+            first_token_str = ""
+        elif not prompt.startswith(first_token_str):
+            first_token_str = ""
+
+        entries = [{"token": first_token_str, "null_logprob": True}]
+        entries.extend(prompt_probs)
+        return entries
+
     def _get_vocabulary_size(self):
-        """Get and store the model's maximum context length."""
+        """Get and store the model's vocabulary size."""
         url = f"http://127.0.0.1:{self.port}/v1/models"
         response = self.session.get(url).json()
 

From 328534b762f22c82b09babf6b04e289eab4a7fde Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 12:51:07 -0700
Subject: [PATCH 181/210] Update llama.cpp

---
 requirements/full/requirements.txt                   | 8 ++++----
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 8 ++++----
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 12 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 6e11dd2f..57991c9a 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index c964eff6..bb47ea4b 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b1dd6a4f..5750b109 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 4d03d280..d8302d3d 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 9d41d069..d3a5c008 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,7 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index ff80b6c8..1180b42d 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 318044da..57aa6262 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 1676bffb..894c9199 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 27fc2da8..32b9727f 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 0bbdd30a..73b72832 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index c3ae3c57..ad96bbe2 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index e646c04c..a5df3ad4 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 4073164be0b305d8ac4a01d4259448370d009a99 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 19:08:37 -0700
Subject: [PATCH 182/210] Fix ExLlamav3 OOM on prompt logprobs and qwen3_5_moe
 HF compat

---
 modules/api/completions.py | 13 +++++--------
 modules/exllamav3.py       | 33 ++++-----------------------------
 modules/exllamav3_hf.py    | 32 ++++++++------------------------
 3 files changed, 17 insertions(+), 61 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 587ad6ea..a15e1f86 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -91,17 +91,14 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
     import torch
 
     if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
-        # Native ExLlamav3: call the underlying Model.forward() directly
+        # Native ExLlamav3: call the underlying Model.forward() once over the whole
+        # prompt without a cache ("flash_attn_nc"); full logits are cast to float on CPU
         input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+        input_ids_tensor = input_ids_tensor.view(-1).cpu()
         with torch.no_grad():
             logits = model.model.forward(
-                input_ids=input_ids_tensor,
-                params={
-                    "attn_mode": "flash_attn",
-                    "cache": model.cache,
-                    "past_len": 0,
-                    "batch_shape": (1, model.max_tokens),
-                }
+                input_ids=input_ids_tensor.view(1, -1),
+                params={"attn_mode": "flash_attn_nc"}
             ).float().cpu()
 
     elif hasattr(model, 'forward'):
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 3782a693..7556a908 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -530,39 +530,14 @@ class Exllamav3Model:
     def get_logits(self, token_ids, **kwargs):
         """
         Process a batch of token_ids and return the logits for the last token.
-        This will reset and overwrite the model's cache.
+        Uses flash_attn_nc (no cache) for correct results with recurrent models.
         """
-        # Initialize a single params dictionary that will be updated in-place
-        params = {
-            "cache": self.cache,
-            "reconstruct": False,
-            "attn_mode": "flash_attn",
-            "batch_shape": (1, self.max_tokens),
-            "past_len": 0
-        }
-        params.update(kwargs)
-
-        # Process prefix tokens to fill the cache and generate recurrent state
-        if token_ids.shape[-1] > 1:
-            prefix_ids = token_ids[:, :-1]
-
-            # This forward call updates the 'params' dict with the recurrent state
-            self.model.forward(
-                input_ids=prefix_ids,
-                params=params
-            )
-
-            # Update past_len for the next call
-            params["past_len"] = prefix_ids.shape[-1]
-
-        # Process the last token, now using the state-filled 'params' dict
-        last_token_ids = token_ids[:, -1:]
         logits = self.model.forward(
-            input_ids=last_token_ids,
-            params=params
+            input_ids=token_ids,
+            params={"attn_mode": "flash_attn_nc"}
         )
 
-        return logits.float().cpu()
+        return logits[:, -1:, :].float().cpu()
 
     def encode(self, string, **kwargs):
         add_bos = kwargs.pop('add_bos', True)
diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index e0ad5002..5e634e22 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -26,6 +26,9 @@ except Exception:
 class Exllamav3HF(PreTrainedModel, GenerationMixin):
     def __init__(self, model_dir):
         hf_config = PretrainedConfig.from_pretrained(model_dir)
+        # Ensure text_config is a proper object, not a dict (fixes qwen3_5_moe + transformers compat)
+        if isinstance(getattr(hf_config, 'text_config', None), dict):
+            hf_config.text_config = PretrainedConfig(**hf_config.text_config)
         super().__init__(hf_config)
 
         exl3_config = Config.from_directory(model_dir)
@@ -199,30 +202,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
                 }
             ).to(input_ids.device).float()
         else:
-            # Labels path: use cache for cross-chunk attention.
-            tokens_to_process = seq_tensor
-            all_logits = None
-            current_len = 0
-
-            for i in range(0, tokens_to_process.shape[0], max_chunk_size):
-                chunk = tokens_to_process[i:i + max_chunk_size]
-                chunk_logits = self.ex_model.forward(
-                    input_ids=chunk.view(1, -1),
-                    params={
-                        "attn_mode": "flash_attn",
-                        "cache": ex_cache,
-                        "past_len": current_len,
-                        "batch_shape": (1, self.max_tokens),
-                    }
-                ).float()
-                current_len += chunk.shape[0]
-
-                if all_logits is None:
-                    all_logits = chunk_logits
-                else:
-                    all_logits = torch.cat([all_logits, chunk_logits], dim=1)
-
-            logits = all_logits
+            # Labels path: single pass without cache for correct logits
+            logits = self.ex_model.forward(
+                input_ids=seq_tensor.view(1, -1),
+                params={"attn_mode": "flash_attn_nc"}
+            ).float().cpu()
 
         if is_negative:
             self.past_seq_negative = seq_tensor

From a32ce254f275efe473d6624995957b3b6bd51aa1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 20:28:44 -0700
Subject: [PATCH 183/210] Don't pass torch_dtype to transformers, autodetect
 from model config

---
 modules/transformers_loader.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index 7f521b8c..5964f012 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -109,7 +109,6 @@ def load_model_HF(model_name):
     params = {
         'low_cpu_mem_usage': True,
         'attn_implementation': shared.args.attn_implementation,
-        'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
     }
 
     if shared.original_args.trust_remote_code:
@@ -120,6 +119,17 @@ def load_model_HF(model_name):
 
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code)
 
+    # Determine torch_dtype: respect --bf16 flag, otherwise autodetect
+    # from model config, but never allow float32.
+    if shared.args.bf16:
+        params['torch_dtype'] = torch.bfloat16
+    else:
+        dtype = getattr(config, 'torch_dtype', None) or getattr(getattr(config, 'text_config', None), 'torch_dtype', None)
+        if dtype in (torch.float16, torch.bfloat16):
+            params['torch_dtype'] = dtype
+        else:
+            params['torch_dtype'] = torch.float16
+
     if 'chatglm' in model_name.lower():
         LoaderClass = AutoModel
     else:

From c10c6e87ae0b0085b36e7e13269461744ce04ff6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 07:17:27 -0700
Subject: [PATCH 184/210] API: Add token ids to logprobs output

---
 modules/api/completions.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index a15e1f86..453fa07b 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -143,17 +143,17 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
         if token_id in top_ids:
             actual_lp = top_log_probs[top_ids.index(token_id)].item()
             alternatives = [
-                {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+                {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
                 for j in range(k) if top_ids[j] != token_id
             ]
         else:
             actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
             alternatives = [
-                {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+                {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
                 for j in range(k - 1)  # drop lowest to make room
             ]
 
-        entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives}
+        entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives}
         entries.append(entry)
 
     return entries
@@ -239,7 +239,7 @@ def format_chat_logprobs(entries):
 def format_completion_logprobs(entries):
     """Format logprob entries into OpenAI completions logprobs format.
 
-    Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "text_offset"}
+    Output: {"tokens", "token_logprobs", "top_logprobs": [{token: logprob}], "text_offset"}, plus "top_logprobs_ids": [{token_id: logprob} or None] only when at least one entry carries token ids
     """
     if not entries:
         return None
@@ -247,6 +247,7 @@ def format_completion_logprobs(entries):
     tokens = []
     token_logprobs = []
     top_logprobs = []
+    top_logprobs_ids = []
     text_offset = []
     offset = 0
 
@@ -257,6 +258,7 @@ def format_completion_logprobs(entries):
             tokens.append(token_str)
             token_logprobs.append(None)
             top_logprobs.append(None)
+            top_logprobs_ids.append(None)
             text_offset.append(offset)
             offset += len(token_str)
             continue
@@ -273,21 +275,28 @@ def format_completion_logprobs(entries):
         offset += len(token_str)
 
         top_dict = {}
+        top_dict_ids = {}
         for item in top:
             t = item.get('token', '')
             lp = item.get('logprob', item.get('prob', 0))
             top_dict[t] = lp
+            if 'token_id' in item:
+                top_dict_ids[item['token_id']] = lp
         top_logprobs.append(top_dict)
+        top_logprobs_ids.append(top_dict_ids if top_dict_ids else None)
 
     if not tokens:
         return None
 
-    return {
+    result = {
         "tokens": tokens,
         "token_logprobs": token_logprobs,
         "top_logprobs": top_logprobs,
         "text_offset": text_offset
     }
+    if any(x is not None for x in top_logprobs_ids):
+        result["top_logprobs_ids"] = top_logprobs_ids
+    return result
 
 
 def process_parameters(body, is_legacy=False):

From ea1f8c71f2e92dc9ae230b943c605e43ff5c633c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:30:59 -0300
Subject: [PATCH 185/210] API: Optimize prompt logprobs and refactor ExLlamav3
 forward pass

---
 modules/api/completions.py | 69 ++++++++++++++++++++++++--------------
 modules/exllamav3.py       | 14 ++++++++
 2 files changed, 58 insertions(+), 25 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 453fa07b..4eb8fdad 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -90,16 +90,8 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
 
     import torch
 
-    if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
-        # Native ExLlamav3: call the underlying Model.forward() in chunks
-        # to avoid OOM from giant logits tensors (seq_len * vocab_size * 4 bytes)
-        input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
-        input_ids_tensor = input_ids_tensor.view(-1).cpu()
-        with torch.no_grad():
-            logits = model.model.forward(
-                input_ids=input_ids_tensor.view(1, -1),
-                params={"attn_mode": "flash_attn_nc"}
-            ).float().cpu()
+    if hasattr(model, 'get_prompt_logits'):
+        logits = model.get_prompt_logits(input_ids)
 
     elif hasattr(model, 'forward'):
         # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
@@ -111,26 +103,54 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
             # not just the last token (some HF wrappers like ExLlamav3_HF
             # only compute the last-token logits when labels are absent).
             outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor)
-            logits = outputs.logits.float().cpu()
+            logits = outputs.logits  # keep on GPU, (1, seq_len, vocab) in model dtype
+            del outputs
 
     else:
         return []
 
     entries = [{"token": first_token_str, "null_logprob": True}]
 
-    # Batch logsumexp and topk as single operations across all positions
-    # to avoid per-position kernel launch overhead.
-    prompt_logits = logits[0, :n_tokens - 1]  # positions 0..n-2 predict tokens 1..n-1
-    k = min(logprobs_count, prompt_logits.shape[-1])
-    all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1)
-    all_lse = torch.logsumexp(prompt_logits, dim=-1)
-    all_top_log_probs = all_top_values - all_lse.unsqueeze(-1)
-
-    # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls
+    logprobs_count = max(logprobs_count, 1)
+    k = min(logprobs_count, logits.shape[-1])
+    chunk_size = 2048
     unique_ids = set(int(tid) for tid in token_ids[1:])
-    unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist())
 
-    decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids}
+    # Process logits in chunks on whatever device they are on (already CPU for the ExLlamav3 paths); only top-K results are kept, on CPU
+    all_top_log_probs_list = []
+    all_top_indices_list = []
+    all_actual_lps = []
+
+    for start in range(0, n_tokens - 1, chunk_size):
+        end = min(start + chunk_size, n_tokens - 1)
+        chunk_logits = logits[0, start:end].float()  # (chunk, vocab) on GPU
+        chunk_lse = torch.logsumexp(chunk_logits, dim=-1)
+        chunk_top_values, chunk_top_indices = torch.topk(chunk_logits, k=k, dim=-1)
+        chunk_top_log_probs = chunk_top_values - chunk_lse.unsqueeze(-1)
+
+        # Compute logprob for actual next tokens in this chunk
+        chunk_top_sets = [set(chunk_top_indices[j].tolist()) for j in range(end - start)]
+        for j in range(end - start):
+            actual_tid = int(token_ids[start + j + 1])
+            if actual_tid not in chunk_top_sets[j]:
+                all_actual_lps.append((chunk_logits[j, actual_tid] - chunk_lse[j]).item())
+            else:
+                all_actual_lps.append(None)  # will use top_log_probs
+
+        all_top_log_probs_list.append(chunk_top_log_probs.cpu())
+        all_top_indices_list.append(chunk_top_indices.cpu())
+        unique_ids.update(int(tid) for tid in chunk_top_indices.flatten().tolist())
+        del chunk_logits, chunk_lse, chunk_top_values
+
+    del logits
+    torch.cuda.empty_cache()
+
+    all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0)
+    all_top_indices = torch.cat(all_top_indices_list, dim=0)
+
+    unique_ids_list = sorted(unique_ids)
+    decoded_list = shared.tokenizer.batch_decode([[tid] for tid in unique_ids_list]) if hasattr(shared.tokenizer, 'batch_decode') else [shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids_list]
+    decoded_strs = dict(zip(unique_ids_list, decoded_list))
 
     for i in range(1, n_tokens):
         token_id = int(token_ids[i])
@@ -139,7 +159,6 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
         top_ids = all_top_indices[idx].tolist()
         actual_token_str = decoded_strs[token_id]
 
-        # Build the top list with the actual prompt token guaranteed at front
         if token_id in top_ids:
             actual_lp = top_log_probs[top_ids.index(token_id)].item()
             alternatives = [
@@ -147,10 +166,10 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
                 for j in range(k) if top_ids[j] != token_id
             ]
         else:
-            actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
+            actual_lp = all_actual_lps[idx]
             alternatives = [
                 {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
-                for j in range(k - 1)  # drop lowest to make room
+                for j in range(k - 1)
             ]
 
         entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives}
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 7556a908..e1efbfeb 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -527,6 +527,20 @@ class Exllamav3Model:
 
         return output
 
+    def get_prompt_logits(self, input_ids):
+        """Return logits for all positions via a single no-cache forward pass.
+
+        Used by prompt logprobs computation. Returns (1, seq_len, vocab) on CPU in float32.
+        """
+        import torch  # function-scope import: torch is only needed once this method is called
+        input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)  # accept a tensor or any sequence of ids
+        input_ids_tensor = input_ids_tensor.view(1, -1).cpu()  # flatten into one row (batch of 1) and place on CPU
+        with torch.no_grad():  # inference only, no autograd bookkeeping
+            return self.model.forward(
+                input_ids=input_ids_tensor,
+                params={"attn_mode": "flash_attn_nc"}  # NOTE(review): "nc" presumably means "no cache", per the docstring — confirm against exllamav3
+            ).cpu().float()  # transfer to CPU first, then cast to float32
+
     def get_logits(self, token_ids, **kwargs):
         """
         Process a batch of token_ids and return the logits for the last token.

From c50e17bdbe1da850189188afaf0682a952efa0d1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:49:31 -0300
Subject: [PATCH 186/210] Add dedicated ik portable requirements files and
 remove macOS ik builds

---
 .github/workflows/build-everything-tgw.yml    |  7 ---
 .../build-portable-release-ik-cuda.yml        |  9 ++--
 .../workflows/build-portable-release-ik.yml   | 44 +++----------------
 requirements/portable/requirements_ik.txt     | 27 ++++++++++++
 .../portable/requirements_ik_cpu_only.txt     | 27 ++++++++++++
 .../portable/requirements_ik_cuda131.txt      | 27 ++++++++++++
 6 files changed, 91 insertions(+), 50 deletions(-)
 create mode 100644 requirements/portable/requirements_ik.txt
 create mode 100644 requirements/portable/requirements_ik_cpu_only.txt
 create mode 100644 requirements/portable/requirements_ik_cuda131.txt

diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 4de591f4..40d9db5d 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -96,10 +96,3 @@ jobs:
     with:
       version: ${{ inputs.version }}
       config: 'os:ubuntu-22.04'
-
-  build_release_ik_macos:
-    name: ik macOS
-    uses: ./.github/workflows/build-portable-release-ik.yml
-    with:
-      version: ${{ inputs.version }}
-      config: 'os:macos-14'
diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml
index 40b4b92f..331a7653 100644
--- a/.github/workflows/build-portable-release-ik-cuda.yml
+++ b/.github/workflows/build-portable-release-ik-cuda.yml
@@ -138,14 +138,13 @@ jobs:
             # 3. Prepare requirements file based on CUDA version
             cd "text-generation-webui-${VERSION_CLEAN}"
             if [[ "$CUDA_VERSION" == "13.1" ]]; then
-                REQ_FILE="requirements/portable/requirements_cuda131.txt"
+                REQ_FILE="requirements/portable/requirements_ik_cuda131.txt"
             else
-                REQ_FILE="requirements/portable/requirements.txt"
+                REQ_FILE="requirements/portable/requirements_ik.txt"
             fi
 
-            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
-            sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
-            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true
+            # 4. Inject --ik into start scripts
+            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
 
             # 5. Install packages
             echo "Installing Python packages from $REQ_FILE..."
diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml
index afb2e763..bf54eb0e 100644
--- a/.github/workflows/build-portable-release-ik.yml
+++ b/.github/workflows/build-portable-release-ik.yml
@@ -1,4 +1,4 @@
-name: Build ik CPU and macOS
+name: Build ik CPU
 
 on:
   workflow_dispatch:
@@ -57,7 +57,7 @@ jobs:
         id: set-matrix
         run: |
           $matrix = @{
-              'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
+              'os' = @('ubuntu-22.04', 'windows-2022')
               'pyver' = @("3.13")
           }
 
@@ -110,7 +110,6 @@ jobs:
 
             # Define common variables
             VERSION="${{ inputs.version }}"
-            OS_TYPE="${{ matrix.os }}"
 
             # 1. Set platform-specific variables
             if [[ "$RUNNER_OS" == "Windows" ]]; then
@@ -119,21 +118,7 @@ jobs:
                 PIP_PATH="portable_env/python.exe -m pip"
                 PACKAGES_PATH="portable_env/Lib/site-packages"
                 rm start_linux.sh start_macos.sh
-            elif [[ "$RUNNER_OS" == "macOS" ]]; then
-                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
-                    PLATFORM="macos-x86_64"
-                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz"
-                    REQ_TYPE="apple_intel"
-                else
-                    PLATFORM="macos-arm64"
-                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz"
-                    REQ_TYPE="apple_silicon"
-                fi
-                PIP_PATH="portable_env/bin/python -m pip"
-                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
-                rm start_linux.sh start_windows.bat
             else
-                # Linux case
                 PLATFORM="linux-cpu"
                 PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/bin/python -m pip"
@@ -148,30 +133,13 @@ jobs:
             tar -xzf python-build.tar.gz
             mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
 
-            # 3. Prepare requirements file based on platform
+            # 3. Prepare requirements file
             cd "text-generation-webui-${VERSION_CLEAN}"
-
-            # Select requirements file based on platform
-            if [[ "$RUNNER_OS" == "macOS" ]]; then
-                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
-                    REQ_FILE="requirements/portable/requirements_apple_intel.txt"
-                else
-                    REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
-                fi
-            else
-                REQ_FILE="requirements/portable/requirements_cpu_only.txt"
-            fi
-
+            REQ_FILE="requirements/portable/requirements_ik_cpu_only.txt"
             echo "Using requirements file: $REQ_FILE"
 
-            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
-            if [[ "$RUNNER_OS" == "macOS" ]]; then
-                sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
-                sed -i '' 's/--portable/--portable --ik/g' start_macos.sh
-            else
-                sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
-                sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
-            fi
+            # 4. Inject --ik into start scripts
+            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
 
             # 5. Install packages
             echo "Installing Python packages from $REQ_FILE..."
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
new file mode 100644
index 00000000..2fa037f7
--- /dev/null
+++ b/requirements/portable/requirements_ik.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# CUDA wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
new file mode 100644
index 00000000..b43b51c4
--- /dev/null
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# ik_llama.cpp (CPU only)
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
new file mode 100644
index 00000000..12767285
--- /dev/null
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# CUDA wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 8f8b57a029715d07ab164aa22a779ea7ea4619f1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:54:20 -0700
Subject: [PATCH 187/210] Update exllamav3

---
 requirements/full/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 57991c9a..5591c9ca 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -44,7 +44,7 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"

From 6a1f720c7bb9aef73c1c7c4e311460174c5255ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:58:20 -0700
Subject: [PATCH 188/210] Update transformers

---
 requirements/full/requirements.txt               | 2 +-
 requirements/full/requirements_amd.txt           | 2 +-
 requirements/full/requirements_apple_intel.txt   | 2 +-
 requirements/full/requirements_apple_silicon.txt | 2 +-
 requirements/full/requirements_cpu_only.txt      | 2 +-
 requirements/full/requirements_nowheels.txt      | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 5591c9ca..30ee0316 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -25,7 +25,7 @@ sentencepiece
 tensorboard
 torchao==0.15.*
 trafilatura==2.0.0
-transformers==5.3.*
+transformers==5.5.*
 triton-windows==3.5.1.post24; platform_system == "Windows"
 tqdm
 wandb
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index bb47ea4b..9edc1d95 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 5750b109..ff8687c1 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d8302d3d..208632e8 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index d3a5c008..4a7e5aaa 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 052085cc..6200589e 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb

From 468cb5cb87bf02f96efcd5acb1d1ac4b08c68273 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:59:28 -0700
Subject: [PATCH 189/210] Update accelerate

---
 requirements/full/requirements.txt               | 2 +-
 requirements/full/requirements_amd.txt           | 2 +-
 requirements/full/requirements_apple_intel.txt   | 2 +-
 requirements/full/requirements_apple_silicon.txt | 2 +-
 requirements/full/requirements_cpu_only.txt      | 2 +-
 requirements/full/requirements_nowheels.txt      | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 30ee0316..e5bec6ec 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 bitsandbytes==0.49.*
 datasets
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 9edc1d95..c6b5b2d0 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ff8687c1..ce671f0a 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 208632e8..d12d9f80 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 4a7e5aaa..4066b1af 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 6200589e..7173345a 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*

From 80e81a54cacacbd8aa16ccf312ae0e574e4b416c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 11:11:44 -0700
Subject: [PATCH 190/210] Remove ik macOS wheels from full requirements

---
 requirements/full/requirements_apple_intel.txt   | 1 -
 requirements/full/requirements_apple_silicon.txt | 1 -
 2 files changed, 2 deletions(-)

diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ce671f0a..55a313e9 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -38,4 +38,3 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d12d9f80..a6d34cbb 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -38,4 +38,3 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"

From f6f8f14c8d0993327a2c86dfa3c976a7c1c569fc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 16:13:39 -0300
Subject: [PATCH 191/210] Security: Fix SSRF in superbooga extensions

---
 extensions/superbooga/download_urls.py   | 3 +++
 extensions/superboogav2/download_urls.py | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/extensions/superbooga/download_urls.py b/extensions/superbooga/download_urls.py
index 424a9885..b28fea42 100644
--- a/extensions/superbooga/download_urls.py
+++ b/extensions/superbooga/download_urls.py
@@ -2,8 +2,11 @@ import concurrent.futures
 
 import requests
 
+from modules.web_search import _validate_url
+
 
 def download_single(url):
+    _validate_url(url)
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
diff --git a/extensions/superboogav2/download_urls.py b/extensions/superboogav2/download_urls.py
index 5b5a2e17..4d8b98b1 100644
--- a/extensions/superboogav2/download_urls.py
+++ b/extensions/superboogav2/download_urls.py
@@ -5,12 +5,14 @@ import requests
 from bs4 import BeautifulSoup
 
 import extensions.superboogav2.parameters as parameters
+from modules.web_search import _validate_url
 
 from .data_processor import process_and_add_to_collector
 from .utils import create_metadata_source
 
 
 def _download_single(url):
+    _validate_url(url)
     response = requests.get(url, timeout=5)
     if response.status_code == 200:
         return response.content

From 091037ec20743ac6c7bccb75b59743045a692c4a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 16:13:45 -0300
Subject: [PATCH 192/210] Fix top_logprobs_ids missing for llama.cpp loader

---
 modules/api/completions.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 4eb8fdad..98bcff47 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -299,8 +299,9 @@ def format_completion_logprobs(entries):
             t = item.get('token', '')
             lp = item.get('logprob', item.get('prob', 0))
             top_dict[t] = lp
-            if 'token_id' in item:
-                top_dict_ids[item['token_id']] = lp
+            tid = item.get('token_id', item.get('id'))
+            if tid is not None:
+                top_dict_ids[tid] = lp
         top_logprobs.append(top_dict)
         top_logprobs_ids.append(top_dict_ids if top_dict_ids else None)
 

From a61bde509ff44a0f7662067bc94efd7f103f3162 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:30:02 -0700
Subject: [PATCH 193/210] Update llama.cpp

---
 requirements/full/requirements.txt                   | 8 ++++----
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 8 ++++----
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 15 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index e5bec6ec..f1a953a5 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index c6b5b2d0..211600e2 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 55a313e9..54d904dd 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index a6d34cbb..8829eb44 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 4066b1af..0a8cfac6 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,7 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 1180b42d..607c642f 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 57aa6262..f0af64c8 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 894c9199..c5f351c5 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 32b9727f..5287aa25 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 73b72832..038318ab 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index ad96bbe2..d87c741e 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 2fa037f7..3e2471ae 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index b43b51c4..8272b9b6 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # ik_llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 12767285..98ef23d7 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index a5df3ad4..157ad313 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From d84157403a1c8b65f8597302463e46c28a6659d1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:31:44 -0700
Subject: [PATCH 194/210] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index f1a953a5..b38ae848 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 211600e2..7fb3a7d9 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 54d904dd..4a0f764c 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 8829eb44..942d5d71 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 0a8cfac6..6b61dca7 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 7173345a..a4d6cc97 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 607c642f..5aff54b2 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index f0af64c8..0771f53e 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index c5f351c5..427d59b2 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 5287aa25..c47a6ca1 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 038318ab..e491e357 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index d87c741e..5870983a 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 3e2471ae..d11d337d 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index 8272b9b6..c2b69e1c 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 98ef23d7..7f280930 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index e38140ce..322056be 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 157ad313..dfd52be5 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 7aab2fdf9aefb0f14fbf58e132a2a9a5850f8319 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:50:42 -0700
Subject: [PATCH 195/210] API: Improve cache clearing in logprobs

---
 modules/api/completions.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 98bcff47..f2282731 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -89,6 +89,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
         return [{"token": first_token_str, "null_logprob": True}]
 
     import torch
+    from modules.torch_utils import clear_torch_cache
 
     if hasattr(model, 'get_prompt_logits'):
         logits = model.get_prompt_logits(input_ids)
@@ -143,7 +144,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
         del chunk_logits, chunk_lse, chunk_top_values
 
     del logits
-    torch.cuda.empty_cache()
+    clear_torch_cache()
 
     all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0)
     all_top_indices = torch.cat(all_top_indices_list, dim=0)

From b108c55353e11343a9e8f8566d92e52e868dfa69 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 19:10:41 -0700
Subject: [PATCH 196/210] Fix portable builds not starting due to missing ik
 element

---
 modules/loaders.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index cb1f3d3b..31b1b51a 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -291,16 +291,21 @@ def blacklist_samplers(loader, dynamic_temperature):
 
 @functools.cache
 def get_all_params():
+    from modules import shared
     all_params = set()
     for k in loaders_and_params:
         for el in loaders_and_params[k]:
             all_params.add(el)
 
+    if shared.args.portable:
+        all_params.discard('ik')
+
     return sorted(all_params)
 
 
+@functools.cache
 def list_model_elements():
-    return [
+    elements = [
         'filter_by_loader',
         'loader',
         'cpu_memory',
@@ -346,9 +351,14 @@ def list_model_elements():
         'spec_ngram_size_m',
         'spec_ngram_min_hits',
         'mmproj',
-        'ik',
     ]
 
+    from modules import shared
+    if not shared.args.portable:
+        elements.append('ik')
+
+    return elements
+
 
 def make_loader_params_visible(loader):
     import gradio as gr

From 6e2b70bde60c089f97b0abe97bb1b594cce75357 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 20:26:09 -0700
Subject: [PATCH 197/210] Add Gemma 4 tool-calling support

---
 modules/chat.py         | 57 +++++++++++++++++++++++++++++
 modules/reasoning.py    |  1 +
 modules/tool_parsing.py | 79 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index edda11b0..818309e6 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -210,6 +210,57 @@ def _expand_tool_sequence(tool_seq):
     return messages
 
 
+def _convert_to_tool_responses(messages):
+    """Convert role:'tool' messages to tool_responses format.
+
+    Templates like Gemma 4 expect tool results as a ``tool_responses``
+    attribute on a message rather than separate ``role: 'tool'`` messages.
+    This function groups consecutive tool messages and rewrites them.
+    """
+    result = []
+    tc_id_to_name = {}
+
+    i = 0
+    while i < len(messages):
+        msg = messages[i]
+
+        if msg.get('tool_calls'):
+            for tc in msg['tool_calls']:
+                tc_id = tc.get('id', '')
+                func_name = tc.get('function', {}).get('name', 'unknown')
+                if tc_id:
+                    tc_id_to_name[tc_id] = func_name
+
+        if msg.get('role') == 'tool':
+            tool_responses = []
+            while i < len(messages) and messages[i].get('role') == 'tool':
+                tool_msg = messages[i]
+                tc_id = tool_msg.get('tool_call_id', '')
+                func_name = tc_id_to_name.get(tc_id, 'unknown')
+
+                content = tool_msg.get('content', '')
+                try:
+                    response = json.loads(content)
+                except (json.JSONDecodeError, ValueError, TypeError):
+                    response = content
+
+                tool_responses.append({
+                    'name': func_name,
+                    'response': response,
+                })
+                i += 1
+
+            result.append({
+                'role': 'tool',
+                'tool_responses': tool_responses,
+            })
+        else:
+            result.append(msg)
+            i += 1
+
+    return result
+
+
 def _format_attachments(attachments, include_text=True):
     """Build image ref and text attachment strings from a list of attachments."""
     attachments_text = ""
@@ -267,6 +318,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
         tools=state['tools'] if 'tools' in state else None,
     )
 
+    active_template_str = state['instruction_template_str'] if state['mode'] == 'instruct' else chat_template_str
+    uses_tool_responses = 'tool_responses' in active_template_str
+
     messages = []
 
     if state['mode'] == 'instruct':
@@ -503,6 +557,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
         return prompt
 
+    if uses_tool_responses:
+        messages = _convert_to_tool_responses(messages)
+
     prompt = make_prompt(messages)
 
     # Handle truncation
diff --git a/modules/reasoning.py b/modules/reasoning.py
index aa1939b8..4a7cfa79 100644
--- a/modules/reasoning.py
+++ b/modules/reasoning.py
@@ -7,6 +7,7 @@ THINKING_FORMATS = [
     ('<|channel|>analysis<|message|>', '<|end|>', '<|channel|>final<|message|>'),
     ('<|channel|>commentary<|message|>', '<|end|>', '<|channel|>final<|message|>'),
     ('<seed:think>', '</seed:think>', None),
+    ('<|channel>thought', '<channel|>', None),  # Gemma 4
     ('<|think|>', '<|end|>', '<|content|>'),  # Solar Open
     # ('Thinking Process:', '</think>', None),  # Qwen3.5 verbose thinking outside tags -- removed: too prone to false positives in streaming
     (None, '</think>', None),  # End-only variant (e.g., Qwen3-next)
diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index ec49f77f..45da25c9 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -27,6 +27,7 @@ TOOL_CALL_OPENING_MARKERS = [
     '[TOOL_CALLS]',
     'to=functions.',
     '<|channel|>commentary',
+    '<|tool_call>call:',
 ]
 
 
@@ -400,6 +401,78 @@ def _parse_glm_tool_calls(answer: str, tool_names: list[str]):
     return matches, start_pos
 
 
+def _extract_gemma4_balanced(text, start):
+    """Extract balanced braces from Gemma 4 format, using <|"|> as string delimiters."""
+    if start >= len(text) or text[start] != '{':
+        return None
+    depth = 0
+    in_string = False
+    quote_token = '<|"|>'
+    quote_len = len(quote_token)
+    i = start
+    while i < len(text):
+        if text[i:i + quote_len] == quote_token:
+            in_string = not in_string
+            i += quote_len
+            continue
+        if in_string:
+            i += 1
+            continue
+        c = text[i]
+        if c == '{':
+            depth += 1
+        elif c == '}':
+            depth -= 1
+            if depth == 0:
+                return text[start:i + 1]
+        i += 1
+    return None
+
+
+def _parse_gemma4_tool_calls(answer: str, tool_names: list[str]):
+    """Parse Gemma 4-style tool calls.
+
+    Format:
+        <|tool_call>call:func_name{key:<|"|>value<|"|>,...}<tool_call|>
+
+    Values use <|"|> tokens instead of standard JSON quotes, and keys are
+    bare identifiers.
+    """
+    matches = []
+    start_pos = None
+
+    for m in re.finditer(r'<\|tool_call>call:([^\s{]+)\s*', answer):
+        func_name = m.group(1).strip()
+        if func_name not in tool_names:
+            continue
+
+        brace_start = m.end()
+        if brace_start >= len(answer) or answer[brace_start] != '{':
+            continue
+
+        content = _extract_gemma4_balanced(answer, brace_start)
+        if content is None:
+            continue
+
+        # Convert to JSON: split on <|"|> tokens so that key quoting
+        # only applies outside string values (even-indexed parts),
+        # then rejoin with real quotes.
+        parts = content.split('<|"|>')
+        for idx in range(0, len(parts), 2):
+            parts[idx] = re.sub(r'(^|[{,\[])\s*(\w+)\s*:', r'\1"\2":', parts[idx])
+        json_str = '"'.join(parts)
+
+        try:
+            arguments = json.loads(json_str)
+            if start_pos is None:
+                start_pos = m.start()
+            matches.append(_make_tool_call(func_name, arguments))
+        except (json.JSONDecodeError, ValueError):
+            pass
+
+    return matches, start_pos
+
+
 def _parse_pythonic_tool_calls(answer: str, tool_names: list[str]):
     """Parse pythonic-style tool calls used by Llama 4 and similar models.
 
@@ -472,6 +545,11 @@ TOOL_CALL_FORMATS = [
         'parser': _parse_channel_tool_calls,
         'markers': ['to=functions.', '<|channel|>commentary'],
     },
+    {
+        'template_hints': ['<|tool_call>call:'],
+        'parser': _parse_gemma4_tool_calls,
+        'markers': ['<|tool_call>call:'],
+    },
     {
         'template_hints': ['minimax:tool_call'],
         'parser': _parse_minimax_tool_calls,
@@ -504,6 +582,7 @@ ALL_PARSERS = [
     _parse_deep_seek_tool_calls,
     _parse_kimi_tool_calls,
     _parse_channel_tool_calls,
+    _parse_gemma4_tool_calls,
     _parse_minimax_tool_calls,
     _parse_glm_tool_calls,
     _parse_xml_param_tool_calls,

From 42dfcdfc5b50333c40a6adda0f4c8672508212cb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 20:46:27 -0700
Subject: [PATCH 198/210] API: Add warning about vanilla llama-server not
 supporting prompt logprobs + instructions

---
 modules/llama_cpp_server.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 34080466..2d873f00 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -333,6 +333,12 @@ class LlamaServer:
 
         prompt_probs = result.get("prompt_probabilities", [])
         if not prompt_probs:
+            logger.warning(
+                "The llama.cpp server did not return prompt probabilities. "
+                "This feature requires a custom build with prompt_logprobs support. "
+                "See: https://github.com/oobabooga/llama.cpp/tree/prompt-logprobs "
+                "or https://github.com/oobabooga/ik_llama.cpp/tree/prompt-logprobs"
+            )
             return []
 
         # Null first token (no conditioning context); use empty string for BOS

From a1cb5b5dc05d2540640069b9549dd93557c81a16 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 21:56:06 -0700
Subject: [PATCH 199/210] llama.cpp: Disable jinja by default (we use Python
 jinja, not cpp jinja)

This was causing template compilation issues with qwen models.
---
 modules/llama_cpp_server.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2d873f00..a4390adb 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -418,6 +418,7 @@ class LlamaServer:
             "--ubatch-size", str(shared.args.ubatch_size),
             "--port", str(self.port),
             "--no-webui",
+            "--no-jinja",
             "--flash-attn", "on",
         ]
 

From 000d776967f0a73684b85c9d052a738dba073fb6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 05:49:03 -0700
Subject: [PATCH 200/210] Revert "llama.cpp: Disable jinja by default (we use
 Python jinja, not cpp jinja)"

This reverts commit a1cb5b5dc05d2540640069b9549dd93557c81a16.
---
 modules/llama_cpp_server.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index a4390adb..2d873f00 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -418,7 +418,6 @@ class LlamaServer:
             "--ubatch-size", str(shared.args.ubatch_size),
             "--port", str(self.port),
             "--no-webui",
-            "--no-jinja",
             "--flash-attn", "on",
         ]
 

From 66d1a22c733b04c38d89a128a7eeacd8e142e629 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 05:56:36 -0700
Subject: [PATCH 201/210] Fix crash when no model is selected (None passed to
 resolve_model_path)

---
 modules/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/utils.py b/modules/utils.py
index b01953ee..c4acf714 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -105,6 +105,9 @@ def resolve_model_path(model_name_or_path, image_model=False):
     before the default models directory.
     """
 
+    if model_name_or_path is None:
+        raise FileNotFoundError("No model specified.")
+
     path_candidate = Path(model_name_or_path)
     if path_candidate.exists():
         return path_candidate

From 8bba9ecc3fc0afc044a1f6810f014f721dbb7809 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 05:58:05 -0700
Subject: [PATCH 202/210] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index b38ae848..91d27d86 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 7fb3a7d9..eea869b1 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 4a0f764c..391973b7 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 942d5d71..4d0ffe29 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 6b61dca7..44e54eaa 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index a4d6cc97..41d6aad6 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 5aff54b2..91b58e0d 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 0771f53e..36d6dcb1 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 427d59b2..0d882b83 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index c47a6ca1..d79832e5 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index e491e357..3e1de9c9 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 5870983a..40e68d99 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index d11d337d..9e61ad3f 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index c2b69e1c..cdd1218f 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 7f280930..b7422758 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 322056be..372e718b 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index dfd52be5..3e539988 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 95d6c53e13673defecff6def4aead4c7ea157911 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 07:30:48 -0700
Subject: [PATCH 203/210] Revert "API: Add warning about vanilla llama-server
 not supporting prompt logprobs + instructions"

This reverts commit 42dfcdfc5b50333c40a6adda0f4c8672508212cb.
---
 modules/llama_cpp_server.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2d873f00..34080466 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -333,12 +333,6 @@ class LlamaServer:
 
         prompt_probs = result.get("prompt_probabilities", [])
         if not prompt_probs:
-            logger.warning(
-                "The llama.cpp server did not return prompt probabilities. "
-                "This feature requires a custom build with prompt_logprobs support. "
-                "See: https://github.com/oobabooga/llama.cpp/tree/prompt-logprobs "
-                "or https://github.com/oobabooga/ik_llama.cpp/tree/prompt-logprobs"
-            )
             return []
 
         # Null first token (no conditioning context); use empty string for BOS

From 131a9a0140baeef90061bb97065d32b23385e142 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 09:15:03 -0700
Subject: [PATCH 204/210] Update llama.cpp

---
 requirements/full/requirements.txt                   | 8 ++++----
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 8 ++++----
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 15 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 91d27d86..8816f76e 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index eea869b1..466b680f 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 391973b7..49a948c7 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 4d0ffe29..508c137a 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 44e54eaa..17ecbd61 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,7 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 91b58e0d..73d3f3b6 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 36d6dcb1..95c23424 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 0d882b83..4d18875b 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index d79832e5..a181212b 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 3e1de9c9..5ddd53e1 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 40e68d99..3ab238ac 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 9e61ad3f..624fbe5a 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index cdd1218f..c1ab1758 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # ik_llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index b7422758..6d17200d 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 3e539988..cdeb2b79 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 8e8e1ba8984cc3cef4b4c0d88e7c9eb7977dd3fe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 09:50:15 -0700
Subject: [PATCH 205/210] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 8816f76e..3b5501f4 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 466b680f..eb7f8618 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 49a948c7..e11522a9 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 508c137a..76d8a709 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 17ecbd61..8d4df234 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 41d6aad6..cecc2a25 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 73d3f3b6..42db46a4 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 95c23424..5e0b589b 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 4d18875b..711c68f9 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index a181212b..f1bbccf0 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 5ddd53e1..dc2807f2 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 3ab238ac..6d34b894 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 624fbe5a..5b3bac83 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index c1ab1758..b8e0897d 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 6d17200d..fd623b0b 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 372e718b..92c910ac 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index cdeb2b79..bc17dda9 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 6b66da84d2dbccf63ffe79939edf92ad935bb3ee Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 10:01:51 -0700
Subject: [PATCH 206/210] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 3b5501f4..4d9d243c 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index eb7f8618..b6caf320 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index e11522a9..ae2f6263 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 76d8a709..f4783d2b 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 8d4df234..11d670b6 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index cecc2a25..d4b1ca80 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 42db46a4..901d1494 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5e0b589b..5705c64c 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 711c68f9..3c3deaed 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index f1bbccf0..3f1e814c 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index dc2807f2..3bfed7f8 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 6d34b894..dd7059c8 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 5b3bac83..4fdd10fe 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index b8e0897d..1b377463 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index fd623b0b..b85607d3 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 92c910ac..39628ee6 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index bc17dda9..16aa5593 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 5fb8c4fbd6d8112429335b48c93a6fe941f4c5e3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 11:02:00 -0700
Subject: [PATCH 207/210] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 4d9d243c..b7a5ca97 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index b6caf320..2c627585 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ae2f6263..7e3fc35f 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index f4783d2b..2603201d 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 11d670b6..fe3bf3ba 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index d4b1ca80..acae301e 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 901d1494..56795843 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5705c64c..abaa1338 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 3c3deaed..b22a03d9 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 3f1e814c..97c5903c 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 3bfed7f8..57e92f74 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index dd7059c8..1f7d27a7 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 4fdd10fe..65f6a004 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index 1b377463..0a82adb7 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index b85607d3..3d812045 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 39628ee6..91bef10b 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 16aa5593..7c61f0cc 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 8ecdb41078cfaf54fa0be66d54cf6e3911936b68 Mon Sep 17 00:00:00 2001
From: oobabooga <oobabooga4@gmail.com>
Date: Fri, 3 Apr 2026 19:36:50 -0300
Subject: [PATCH 208/210] fix(security): sanitize filenames in all prompt file
 operations (CWE-22) (#7462)

---------

Co-authored-by: Alex Chen <ffulbtech@gmail.com>
---
 modules/prompts.py     | 2 ++
 modules/ui_default.py  | 5 ++++-
 modules/ui_notebook.py | 6 +++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/modules/prompts.py b/modules/prompts.py
index d107ce5a..85dc32e3 100644
--- a/modules/prompts.py
+++ b/modules/prompts.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 
 from modules import shared, utils
+from modules.utils import sanitize_filename
 from modules.text_generation import get_encoded_length
 
 
@@ -18,6 +19,7 @@ def load_prompt(fname):
 
         return initial_content
 
+    fname = sanitize_filename(fname)
     file_path = shared.user_data_dir / 'logs' / 'notebook' / f'{fname}.txt'
     if file_path.exists():
         with open(file_path, 'r', encoding='utf-8') as f:
diff --git a/modules/ui_default.py b/modules/ui_default.py
index 2c367cca..48cb2fc2 100644
--- a/modules/ui_default.py
+++ b/modules/ui_default.py
@@ -10,7 +10,7 @@ from modules.text_generation import (
     stop_everything_event
 )
 from modules.ui_notebook import store_notebook_state_and_debounce
-from modules.utils import gradio
+from modules.utils import gradio, sanitize_filename
 
 inputs = ('textbox-default', 'interface_state')
 outputs = ('output_textbox', 'html-default')
@@ -167,6 +167,7 @@ def handle_new_prompt():
 
 
 def handle_delete_prompt_confirm_default(prompt_name):
+    prompt_name = sanitize_filename(prompt_name)
     available_prompts = utils.get_available_prompts()
     current_index = available_prompts.index(prompt_name) if prompt_name in available_prompts else 0
 
@@ -199,6 +200,8 @@ def handle_rename_prompt_click_default(current_name):
 
 
 def handle_rename_prompt_confirm_default(new_name, current_name):
+    new_name = sanitize_filename(new_name)
+    current_name = sanitize_filename(current_name)
     old_path = shared.user_data_dir / "logs" / "notebook" / f"{current_name}.txt"
     new_path = shared.user_data_dir / "logs" / "notebook" / f"{new_name}.txt"
 
diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py
index f550e646..88f00ac5 100644
--- a/modules/ui_notebook.py
+++ b/modules/ui_notebook.py
@@ -11,7 +11,7 @@ from modules.text_generation import (
     get_token_ids,
     stop_everything_event
 )
-from modules.utils import gradio
+from modules.utils import gradio, sanitize_filename
 
 _notebook_file_lock = threading.Lock()
 _notebook_auto_save_timer = None
@@ -202,6 +202,7 @@ def handle_new_prompt():
 
 
 def handle_delete_prompt_confirm_notebook(prompt_name):
+    prompt_name = sanitize_filename(prompt_name)
     available_prompts = utils.get_available_prompts()
     current_index = available_prompts.index(prompt_name) if prompt_name in available_prompts else 0
 
@@ -233,6 +234,8 @@ def handle_rename_prompt_click_notebook(current_name):
 
 
 def handle_rename_prompt_confirm_notebook(new_name, current_name):
+    new_name = sanitize_filename(new_name)
+    current_name = sanitize_filename(current_name)
     old_path = shared.user_data_dir / "logs" / "notebook" / f"{current_name}.txt"
     new_path = shared.user_data_dir / "logs" / "notebook" / f"{new_name}.txt"
 
@@ -249,6 +252,7 @@ def handle_rename_prompt_confirm_notebook(new_name, current_name):
 
 def autosave_prompt(text, prompt_name):
     """Automatically save the text to the selected prompt file"""
+    prompt_name = sanitize_filename(prompt_name)
     if prompt_name and text.strip():
         prompt_path = shared.user_data_dir / "logs" / "notebook" / f"{prompt_name}.txt"
         prompt_path.parent.mkdir(parents=True, exist_ok=True)

From fc35acab9b07f1b0dc57b89a7cb459894aa44c5b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 16:56:15 -0700
Subject: [PATCH 209/210] API: Fix tool call parser crash on non-dict JSON
 output

---
 modules/tool_parsing.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index 45da25c9..919e523a 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -699,6 +699,8 @@ def parse_tool_call(answer: str, tool_names: list[str], return_prefix: bool = Fa
                 if not isinstance(candidates, list):
                     candidates = [candidates]
                 for candidate_dict in candidates:
+                    if not isinstance(candidate_dict, dict):
+                        continue
                     checked_candidate = check_and_sanitize_tool_call_candidate(candidate_dict, tool_names)
                     if checked_candidate is not None:
                         matches.append(checked_candidate)

From 2fbaee58cd7c65c22267410f8a77b6c04b3ee954 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 20:54:28 -0700
Subject: [PATCH 210/210] Add Windows + ROCm portable builds

---
 .github/workflows/build-everything-tgw.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 40d9db5d..0b65dfd6 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -41,6 +41,13 @@ jobs:
       version: ${{ inputs.version }}
       config: 'os:ubuntu-22.04'
 
+  build_release_rocm_windows:
+    name: ROCm Windows
+    uses: ./.github/workflows/build-portable-release-rocm.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:windows-2022'
+
   build_release_rocm_linux:
     name: ROCm Linux
     uses: ./.github/workflows/build-portable-release-rocm.yml