From fd41f2fafcc6e286b69ba7efe2f5214d89f834ca Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:18:56 -0700
Subject: [PATCH 01/58] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
16 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 9f906b26..f1d40000 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 70e031b8..437a10d9 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 81556326..0170d951 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 7b9d3650..7b369c40 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 0fc9162f..1d1f44e0 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 3565a994..e63e9705 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 64c17416..c03a718a 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 2b162308..70b73e83 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 943ea600..ab91a763 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 394b89b6..0faa6502 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index cffe3aea..e1024942 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index d274e2c8..3d2b6338 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 47ec086e..a95b30b3 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 9a0a3694..2eb7f597 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 45e96da9..3244c9d4 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 9183562e..685c7d1c 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From f247c2ae62fa246414bededf901df058665f819b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:46:02 -0700
Subject: [PATCH 02/58] Make --model work with absolute paths, e.g. --model
/tmp/gemma-3-270m-it-IQ4_NL.gguf
---
modules/models.py | 12 ++++++++++--
modules/models_settings.py | 16 +++++++++-------
modules/utils.py | 13 +++++++++++++
server.py | 13 +++----------
4 files changed, 35 insertions(+), 19 deletions(-)
diff --git a/modules/models.py b/modules/models.py
index ca3d184f..cae88ac5 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -5,6 +5,7 @@ from pathlib import Path
import modules.shared as shared
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
+from modules.utils import resolve_model_path
last_generation_time = time.time()
@@ -69,17 +70,24 @@ def load_model(model_name, loader=None):
def llama_cpp_server_loader(model_name):
from modules.llama_cpp_server import LlamaServer
- path = Path(f'{shared.args.model_dir}/{model_name}')
+ path = resolve_model_path(model_name)
+
if path.is_file():
model_file = path
else:
- model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
+ gguf_files = sorted(path.glob('*.gguf'))
+ if not gguf_files:
+ logger.error(f"No .gguf models found in the directory: {path}")
+ return None, None
+
+ model_file = gguf_files[0]
try:
model = LlamaServer(model_file)
return model, model
except Exception as e:
logger.error(f"Error loading the model with llama.cpp: {str(e)}")
+ return None, None
def transformers_loader(model_name):
diff --git a/modules/models_settings.py b/modules/models_settings.py
index c325fa0c..aa16fdb9 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -10,6 +10,7 @@ import yaml
from modules import chat, loaders, metadata_gguf, shared, ui
from modules.logging_colors import logger
+from modules.utils import resolve_model_path
def get_fallback_settings():
@@ -26,6 +27,7 @@ def get_fallback_settings():
def get_model_metadata(model):
+ model_path = resolve_model_path(model)
model_settings = {}
# Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
@@ -35,7 +37,7 @@ def get_model_metadata(model):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
- path = Path(f'{shared.args.model_dir}/{model}/config.json')
+ path = model_path / 'config.json'
if path.exists():
hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read())
else:
@@ -51,7 +53,7 @@ def get_model_metadata(model):
# GGUF metadata
if model_settings['loader'] == 'llama.cpp':
- path = Path(f'{shared.args.model_dir}/{model}')
+ path = model_path
if path.is_file():
model_file = path
else:
@@ -130,18 +132,18 @@ def get_model_metadata(model):
model_settings['bf16'] = True
# Try to find the Jinja instruct template
- path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
+ path = model_path / 'tokenizer_config.json'
template = None
# 1. Prioritize reading from chat_template.jinja if it exists
- jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja'
+ jinja_path = model_path / 'chat_template.jinja'
if jinja_path.exists():
with open(jinja_path, 'r', encoding='utf-8') as f:
template = f.read()
# 2. If no .jinja file, try chat_template.json
if template is None:
- json_template_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.json'
+ json_template_path = model_path / 'chat_template.json'
if json_template_path.exists():
with open(json_template_path, 'r', encoding='utf-8') as f:
json_data = json.load(f)
@@ -201,7 +203,7 @@ def get_model_metadata(model):
def infer_loader(model_name, model_settings, hf_quant_method=None):
- path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+ path_to_model = resolve_model_path(model_name)
if not path_to_model.exists():
loader = None
elif shared.args.portable:
@@ -357,7 +359,7 @@ def get_model_size_mb(model_file: Path) -> float:
def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
- model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
+ model_file = resolve_model_path(gguf_file)
metadata = load_gguf_metadata_with_cache(model_file)
size_in_mb = get_model_size_mb(model_file)
diff --git a/modules/utils.py b/modules/utils.py
index 4927ef04..e8d23a02 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -86,6 +86,19 @@ def check_model_loaded():
return True, None
+def resolve_model_path(model_name_or_path):
+ """
+ Resolves a model path, checking for a direct path
+ before the default models directory.
+ """
+
+ path_candidate = Path(model_name_or_path)
+ if path_candidate.exists():
+ return path_candidate
+ else:
+ return Path(f'{shared.args.model_dir}/{model_name_or_path}')
+
+
def get_available_models():
# Get all GGUF files
gguf_files = get_available_ggufs()
diff --git a/server.py b/server.py
index 7ce3c208..e6687a3c 100644
--- a/server.py
+++ b/server.py
@@ -283,21 +283,14 @@ if __name__ == "__main__":
# If any model has been selected, load it
if shared.model_name != 'None':
- p = Path(shared.model_name)
- if p.exists():
- model_name = p.parts[-1]
- shared.model_name = model_name
- else:
- model_name = shared.model_name
-
- model_settings = get_model_metadata(model_name)
+ model_settings = get_model_metadata(shared.model_name)
update_model_parameters(model_settings, initial=True) # hijack the command-line arguments
# Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
vram_usage, adjusted_layers = update_gpu_layers_and_vram(
shared.args.loader,
- model_name,
+ shared.model_name,
model_settings['gpu_layers'],
shared.args.ctx_size,
shared.args.cache_type,
@@ -308,7 +301,7 @@ if __name__ == "__main__":
shared.args.gpu_layers = adjusted_layers
# Load the model
- shared.model, shared.tokenizer = load_model(model_name)
+ shared.model, shared.tokenizer = load_model(shared.model_name)
if shared.args.lora:
add_lora_to_model(shared.args.lora)
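
The resolve_model_path helper added above checks whether the argument already names an existing file or directory and only falls back to the models directory otherwise. A minimal standalone sketch of the same idea, with a hypothetical models_dir default standing in for shared.args.model_dir:

from pathlib import Path

def resolve_model_path(model_name_or_path, models_dir="user_data/models"):
    # If the argument already points at an existing file or directory
    # (e.g. --model /tmp/gemma-3-270m-it-IQ4_NL.gguf), use it directly.
    candidate = Path(model_name_or_path)
    if candidate.exists():
        return candidate
    # Otherwise treat it as a name inside the models directory.
    return Path(models_dir) / model_name_or_path

# A bare name maps into the models directory; an existing path is kept as-is.
print(resolve_model_path("MyModel-Q4_K_M.gguf"))
print(resolve_model_path("/tmp/gemma-3-270m-it-IQ4_NL.gguf"))
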
From 7fe8da89448f795838a34adc0b2246855127e4a2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 22 Aug 2025 14:42:56 -0700
Subject: [PATCH 03/58] Minor simplification after
f247c2ae62fa246414bededf901df058665f819b
---
modules/llama_cpp_server.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 5953803a..8f1924cb 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -20,6 +20,7 @@ from modules.image_utils import (
convert_pil_to_base64
)
from modules.logging_colors import logger
+from modules.utils import resolve_model_path
llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"}
@@ -351,14 +352,12 @@ class LlamaServer:
if path.exists():
cmd += ["--mmproj", str(path)]
if shared.args.model_draft not in [None, 'None']:
- path = Path(shared.args.model_draft)
- if not path.exists():
- path = Path(f'{shared.args.model_dir}/{shared.args.model_draft}')
+ path = resolve_model_path(shared.args.model_draft)
if path.is_file():
model_file = path
else:
- model_file = sorted(Path(f'{shared.args.model_dir}/{shared.args.model_draft}').glob('*.gguf'))[0]
+ model_file = sorted(path.glob('*.gguf'))[0]
cmd += ["--model-draft", model_file]
if shared.args.draft_max > 0:
From 8be798e15f48bd1f498d2c609ddf2f31cf22524b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 24 Aug 2025 12:19:19 -0700
Subject: [PATCH 04/58] llama.cpp: Fix stderr deadlock while loading some
multimodal models
---
modules/llama_cpp_server.py | 70 ++++++++++++++++++++++++-------------
1 file changed, 45 insertions(+), 25 deletions(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 8f1924cb..e3dd43b4 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -410,8 +410,7 @@ class LlamaServer:
self.process = subprocess.Popen(
cmd,
stderr=subprocess.PIPE,
- text=True,
- bufsize=1,
+ bufsize=0,
env=env
)
@@ -473,34 +472,55 @@ def filter_stderr_with_progress(process_stderr):
last_was_progress = False
try:
- for raw in iter(process_stderr.readline, ''):
- line = raw.rstrip('\r\n')
- match = progress_re.search(line)
+ # Read in binary mode and decode manually
+ buffer = b""
+ while True:
+ # Read chunks aggressively to prevent buffer overflow
+ chunk = process_stderr.read(4096)
+ if not chunk:
+ break
- if match:
- progress = float(match.group(1))
+ buffer += chunk
- # Extract just the part from "prompt processing" onwards
- prompt_processing_idx = line.find('prompt processing')
- if prompt_processing_idx != -1:
- display_line = line[prompt_processing_idx:]
- else:
- display_line = line # fallback to full line
+ # Process complete lines
+ while b'\n' in buffer:
+ line_bytes, buffer = buffer.split(b'\n', 1)
+ try:
+ line = line_bytes.decode('utf-8', errors='replace').strip('\r\n')
+ if line: # Process non-empty lines
+ match = progress_re.search(line)
- # choose carriage return for in-progress or newline at completion
- end_char = '\r' if progress < 1.0 else '\n'
- print(display_line, end=end_char, file=sys.stderr, flush=True)
- last_was_progress = (progress < 1.0)
+ if match:
+ progress = float(match.group(1))
- # skip noise lines
- elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line):
- # if we were in progress, finish that line first
- if last_was_progress:
- print(file=sys.stderr)
+ # Extract just the part from "prompt processing" onwards
+ prompt_processing_idx = line.find('prompt processing')
+ if prompt_processing_idx != -1:
+ display_line = line[prompt_processing_idx:]
+ else:
+ display_line = line # fallback to full line
- print(line, file=sys.stderr, flush=True)
- last_was_progress = False
+ # choose carriage return for in-progress or newline at completion
+ end_char = '\r' if progress < 1.0 else '\n'
+ print(display_line, end=end_char, file=sys.stderr, flush=True)
+ last_was_progress = (progress < 1.0)
+
+ # skip noise lines
+ elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line):
+ # if we were in progress, finish that line first
+ if last_was_progress:
+ print(file=sys.stderr)
+
+ print(line, file=sys.stderr, flush=True)
+ last_was_progress = False
+
+ except Exception:
+ continue
except (ValueError, IOError):
- # silently ignore broken output or IO errors
pass
+ finally:
+ try:
+ process_stderr.close()
+ except:
+ pass
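
The fix above switches the server's stderr pipe to unbuffered binary mode and drains it in fixed-size chunks, splitting complete lines manually instead of relying on text-mode readline. A minimal sketch of that drain loop against a stand-in child process:

import subprocess
import sys

# Child process that writes a few lines to stderr (stand-in for llama-server).
child_code = "import sys\nfor i in range(3): print(f'line {i}', file=sys.stderr)"

# bufsize=0 plus binary reads lets the parent drain stderr as soon as data
# arrives, so the child never blocks on a full pipe buffer (the deadlock
# addressed by this patch).
proc = subprocess.Popen(
    [sys.executable, "-c", child_code],
    stderr=subprocess.PIPE,
    bufsize=0,
)

buffer = b""
while True:
    chunk = proc.stderr.read(4096)   # read in chunks, not line by line
    if not chunk:
        break
    buffer += chunk
    while b"\n" in buffer:
        line_bytes, buffer = buffer.split(b"\n", 1)
        line = line_bytes.decode("utf-8", errors="replace").rstrip("\r")
        if line:
            print(line, file=sys.stderr, flush=True)

proc.stderr.close()
proc.wait()
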
From 2478294c06dac4ec749f9d37532e1e258a322ee6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 24 Aug 2025 12:37:41 -0700
Subject: [PATCH 05/58] UI: Preload the instruct and chat fonts
---
modules/block_requests.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/modules/block_requests.py b/modules/block_requests.py
index dc1ee467..618b4bd6 100644
--- a/modules/block_requests.py
+++ b/modules/block_requests.py
@@ -38,7 +38,6 @@ def my_get(url, **kwargs):
return requests.api.request('get', 'http://127.0.0.1/', **kwargs)
-# Kindly provided by our friend WizardLM-30B
def my_open(*args, **kwargs):
filename = str(args[0])
if filename.endswith(('index.html', 'share.html')):
@@ -52,6 +51,10 @@ def my_open(*args, **kwargs):
file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1')
file_contents = file_contents.replace(
'',
+ '\n '
+ '\n '
+ '\n '
+ '\n '
'\n '
'\n '
'\n '
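
This patch injects preload hints for the chat and instruct fonts by string-replacing a marker in the served HTML. A minimal sketch of that pattern, assuming a hypothetical '<head>' anchor and hypothetical font paths (the real marker string and font files may differ):

# Hypothetical anchor string and font paths, for illustration only.
PRELOADS = (
    '<head>'
    '\n        <link rel="preload" href="file/css/Inter/Inter-Regular.woff2" '
    'as="font" type="font/woff2" crossorigin>'
    '\n        <link rel="preload" href="file/css/NotoSans/NotoSans-Regular.woff2" '
    'as="font" type="font/woff2" crossorigin>'
)

def inject_preloads(html: str) -> str:
    # Insert the preload hints right after the opening <head> tag.
    return html.replace('<head>', PRELOADS, 1)

print(inject_preloads('<html><head></head><body></body></html>'))
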
From 1f77427088a1487fd9afc8d43282aba6d51557b6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 24 Aug 2025 19:56:22 -0700
Subject: [PATCH 06/58] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
16 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index f1d40000..c53e7722 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 437a10d9..776dbc7b 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 0170d951..7205a72e 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 7b369c40..f4e00c5b 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 1d1f44e0..0b28ad21 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index e63e9705..fcbb48f0 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index c03a718a..1705791e 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 70b73e83..7e951219 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index ab91a763..9eb8b33c 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 0faa6502..0a27f61b 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index e1024942..79674e5a 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 3d2b6338..3ebb7d8b 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index a95b30b3..81b78fe6 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 2eb7f597..58dc529a 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 3244c9d4..5ad8ede1 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 685c7d1c..0adf6e48 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From 3bc48014a5bfdf633b814c23bbb5b42212293b06 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 16:48:21 -0700
Subject: [PATCH 07/58] chat.py code simplifications
---
modules/chat.py | 29 ++++++++++++-----------------
1 file changed, 12 insertions(+), 17 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index ab6b43c0..022ab8c9 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -213,13 +213,11 @@ def generate_chat_prompt(user_input, state, **kwargs):
if assistant_msg:
# Handle GPT-OSS as a special case
if '<|channel|>analysis<|message|>' in assistant_msg or '<|channel|>final<|message|>' in assistant_msg:
-
thinking_content = ""
final_content = ""
# Extract analysis content if present
if '<|channel|>analysis<|message|>' in assistant_msg:
- # Split the message by the analysis tag to isolate the content that follows
parts = assistant_msg.split('<|channel|>analysis<|message|>', 1)
if len(parts) > 1:
# The content is everything after the tag
@@ -240,7 +238,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
# Extract final content if present
final_tag_to_find = '<|channel|>final<|message|>'
if final_tag_to_find in assistant_msg:
- # Split the message by the final tag to isolate the content that follows
parts = assistant_msg.split(final_tag_to_find, 1)
if len(parts) > 1:
# The content is everything after the tag
@@ -261,6 +258,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.insert(insert_pos, msg_dict)
else:
+ # Default case (used by all other models)
messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
@@ -286,18 +284,17 @@ def generate_chat_prompt(user_input, state, **kwargs):
else:
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
- if image_refs or attachments_text:
- enhanced_user_msg = user_msg
- if image_refs:
- enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}"
- if attachments_text:
- enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
+ if image_refs:
+ enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}"
+ if attachments_text:
+ enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})
+ # Handle the current user input
user_input = user_input.strip()
- # Check if we have attachments even with empty input
+ # Check if we have attachments
has_attachments = False
if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
current_row_idx = len(history)
@@ -306,7 +303,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
if (user_input or has_attachments) and not impersonate and not _continue:
# For the current user input being processed, check if we need to add attachments
- if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
+ if len(history_data.get('metadata', {})) > 0:
current_row_idx = len(history)
user_key = f"user_{current_row_idx}"
@@ -325,12 +322,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
else:
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
- if image_refs or attachments_text:
- user_input = user_input
- if image_refs:
- user_input = f"{image_refs}\n\n{user_input}"
- if attachments_text:
- user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
+ if image_refs:
+ user_input = f"{image_refs}\n\n{user_input}"
+ if attachments_text:
+ user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
messages.append({"role": "user", "content": user_input})
From d08800c359bbc90172294a78f569cf284148d4b4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 17:03:37 -0700
Subject: [PATCH 08/58] chat.py improvements
---
modules/chat.py | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 022ab8c9..cd82b813 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -336,10 +336,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
prompt = renderer(messages=messages)
if state['mode'] == 'chat-instruct':
- outer_messages = []
- if state['custom_system_message'].strip() != '':
- outer_messages.append({"role": "system", "content": state['custom_system_message']})
-
command = state['chat-instruct_command']
command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1'])
command = command.replace('<|prompt|>', prompt)
@@ -353,29 +349,31 @@ def generate_chat_prompt(user_input, state, **kwargs):
if not impersonate:
prefix = apply_extensions('bot_prefix', prefix, state)
+ suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
+
+ outer_messages = []
+ if state['custom_system_message'].strip() != '':
+ outer_messages.append({"role": "system", "content": state['custom_system_message']})
+
outer_messages.append({"role": "user", "content": command})
outer_messages.append({"role": "assistant", "content": prefix})
prompt = instruct_renderer(messages=outer_messages)
- suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
if len(suffix) > 0:
prompt = prompt[:-len(suffix)]
else:
# Handle GPT-OSS as a special case when continuing
+ # (otherwise the thinking block gets removed...)
if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']:
last_message_to_continue = messages[-1]
prompt = renderer(messages=messages[:-1])
- # Start the assistant turn wrapper
assistant_reply_so_far = "<|start|>assistant"
-
if 'thinking' in last_message_to_continue:
assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>"
assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}"
-
prompt += assistant_reply_so_far
-
else:
prompt = renderer(messages=messages)
if _continue:
From f919cdf881ee45641d588fc664d2c4fe1cc71c4a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 17:20:51 -0700
Subject: [PATCH 09/58] chat.py code simplifications
---
modules/chat.py | 128 +++++++++++++++---------------------------------
1 file changed, 40 insertions(+), 88 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index cd82b813..023f5a3e 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -86,36 +86,6 @@ yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)
-def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True):
- '''
- Given a Jinja template, reverse-engineers the prefix and the suffix for
- an assistant message (if impersonate=False) or an user message
- (if impersonate=True)
- '''
-
- if impersonate:
- messages = [
- {"role": "user", "content": "<<|user-message-1|>>"},
- {"role": "user", "content": "<<|user-message-2|>>"},
- ]
- else:
- messages = [
- {"role": "assistant", "content": "<<|user-message-1|>>"},
- {"role": "assistant", "content": "<<|user-message-2|>>"},
- ]
-
- prompt = renderer(messages=messages)
-
- suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0]
- suffix = prompt.split("<<|user-message-2|>>")[1]
- prefix = suffix_plus_prefix[len(suffix):]
-
- if strip_trailing_spaces:
- prefix = prefix.rstrip(' ')
-
- return prefix, suffix
-
-
def get_thinking_suppression_string(template):
"""
Determines what string needs to be added to suppress thinking mode
@@ -341,26 +311,16 @@ def generate_chat_prompt(user_input, state, **kwargs):
command = command.replace('<|prompt|>', prompt)
command = replace_character_names(command, state['name1'], state['name2'])
- if _continue:
- prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0]
- prefix += messages[-1]["content"]
- else:
- prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
- if not impersonate:
- prefix = apply_extensions('bot_prefix', prefix, state)
-
- suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
-
outer_messages = []
if state['custom_system_message'].strip() != '':
outer_messages.append({"role": "system", "content": state['custom_system_message']})
outer_messages.append({"role": "user", "content": command})
- outer_messages.append({"role": "assistant", "content": prefix})
- prompt = instruct_renderer(messages=outer_messages)
- if len(suffix) > 0:
- prompt = prompt[:-len(suffix)]
+ prompt = instruct_renderer(
+ messages=outer_messages,
+ add_generation_prompt=True
+ )
else:
# Handle GPT-OSS as a special case when continuing
# (otherwise the thinking block gets removed...)
@@ -375,29 +335,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}"
prompt += assistant_reply_so_far
else:
- prompt = renderer(messages=messages)
- if _continue:
- suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
- if len(suffix) > 0:
- prompt = prompt[:-len(suffix)]
- else:
- prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
-
- # Handle GPT-OSS as a special case when not continuing
- if '<|channel|>final<|message|>' in state['instruction_template_str']:
- if prefix.endswith("<|channel|>final<|message|>"):
- prefix = prefix[:-len("<|channel|>final<|message|>")]
-
- if impersonate:
- prefix += "<|message|>"
-
- if state['mode'] == 'chat' and not impersonate:
- prefix = apply_extensions('bot_prefix', prefix, state)
-
- prompt += prefix
-
- if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])):
- prompt += get_thinking_suppression_string(instruction_template)
+ prompt = renderer(
+ messages=messages,
+ add_generation_prompt=True
+ )
return prompt
@@ -523,24 +464,41 @@ def get_stopping_strings(state):
renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2'])
renderers.append(renderer)
- for renderer in renderers:
- prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False)
- prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True)
+ fake_messages = [
+ {"role": "user", "content": "first user message"},
+ {"role": "assistant", "content": "first assistant message"},
+ {"role": "user", "content": "second user message"},
+ {"role": "assistant", "content": "second assistant message"},
+ ]
- stopping_strings += [
- suffix_user + prefix_bot,
- suffix_user + prefix_user,
- suffix_bot + prefix_bot,
- suffix_bot + prefix_user,
+ stopping_strings = []
+ for renderer in renderers:
+ prompt = renderer(messages=fake_messages)
+
+ # Find positions of each message content
+ first_user_end = prompt.find("first user message") + len("first user message")
+ first_assistant_start = prompt.find("first assistant message")
+ first_assistant_end = prompt.find("first assistant message") + len("first assistant message")
+ second_user_start = prompt.find("second user message")
+ second_assistant_end = prompt.find("second assistant message") + len("second assistant message")
+
+ # Extract pieces of text potentially containing unique stopping strings
+ texts = [
+ prompt[first_user_end:first_assistant_start],
+ prompt[first_assistant_end:second_user_start],
+ prompt[second_assistant_end:]
]
- # Try to find the EOT token
- for item in stopping_strings.copy():
- item = item.strip()
- if item.startswith("<") and ">" in item:
- stopping_strings.append(item.split(">")[0] + ">")
- elif item.startswith("[") and "]" in item:
- stopping_strings.append(item.split("]")[0] + "]")
+ for text in texts:
+ text = text.strip()
+ if text.startswith("<") and ">" in text:
+ stopping_strings.append(text.split(">")[0] + ">")
+ elif text.startswith("[") and "]" in text:
+ stopping_strings.append(text.split("]")[0] + "]")
+ elif text.startswith("(") and ")" in text:
+ stopping_strings.append(text.split(")")[0] + ")")
+ elif text.startswith("{") and "}" in text:
+ stopping_strings.append(text.split("}")[0] + "}")
if 'stopping_strings' in state and isinstance(state['stopping_strings'], list):
stopping_strings += state.pop('stopping_strings')
@@ -549,12 +507,6 @@ def get_stopping_strings(state):
result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)]
result = list(set(result))
- # Handle GPT-OSS as a special case
- if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
- result.remove("<|end|>")
- result.append("<|result|>")
- result = list(set(result))
-
if shared.args.verbose:
logger.info("STOPPING_STRINGS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result)
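
To make the new get_stopping_strings logic above easier to follow, here is a minimal standalone sketch of the same idea; the toy Jinja2 template and the "<|...|>" tag format are illustrative assumptions, not the project's actual templates:

from jinja2 import Template

# Toy chat template: each turn is "<|role|>content<|end|>".
template = Template(
    "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}<|end|>\n{% endfor %}"
)

fake_messages = [
    {"role": "user", "content": "first user message"},
    {"role": "assistant", "content": "first assistant message"},
    {"role": "user", "content": "second user message"},
    {"role": "assistant", "content": "second assistant message"},
]
prompt = template.render(messages=fake_messages)

# The text between one message's content and the next contains the role
# markers; anything that looks like a tag is a candidate stopping string.
end = prompt.find("first user message") + len("first user message")
start = prompt.find("first assistant message")
separator = prompt[end:start].strip()
if separator.startswith("<") and ">" in separator:
    print(separator.split(">")[0] + ">")  # -> <|end|>
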
From aad0104c1b536d62c19e59f4afc5a90c703f169f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 17:33:13 -0700
Subject: [PATCH 10/58] Remove a function
---
modules/chat.py | 38 --------------------------------------
1 file changed, 38 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 023f5a3e..05ed02bf 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -86,44 +86,6 @@ yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)
-def get_thinking_suppression_string(template):
- """
- Determines what string needs to be added to suppress thinking mode
- by comparing template renderings with thinking enabled vs disabled.
- """
-
- # Render with thinking enabled
- with_thinking = template.render(
- messages=[{'role': 'user', 'content': ''}],
- builtin_tools=None,
- tools=None,
- tools_in_user_message=False,
- add_generation_prompt=True,
- enable_thinking=True
- )
-
- # Render with thinking disabled
- without_thinking = template.render(
- messages=[{'role': 'user', 'content': ''}],
- builtin_tools=None,
- tools=None,
- tools_in_user_message=False,
- add_generation_prompt=True,
- enable_thinking=False
- )
-
- # Find the difference (what gets added to suppress thinking)
- i = 0
- while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]:
- i += 1
-
- j = 0
- while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]:
- j += 1
-
- return without_thinking[i:len(without_thinking) - j if j else None]
-
-
def generate_chat_prompt(user_input, state, **kwargs):
impersonate = kwargs.get('impersonate', False)
_continue = kwargs.get('_continue', False)
From adeca8a65888f97b94dfdaff6b2492c031ec1ccd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 17:36:01 -0700
Subject: [PATCH 11/58] Remove changes to the jinja2 templates
---
modules/models_settings.py | 4 ----
1 file changed, 4 deletions(-)
diff --git a/modules/models_settings.py b/modules/models_settings.py
index aa16fdb9..7645880f 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -94,8 +94,6 @@ def get_model_metadata(model):
template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
template = re.sub(r'raise_exception\([^)]*\)', "''", template)
- template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
- template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS
model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
model_settings['instruction_template_str'] = template
@@ -172,8 +170,6 @@ def get_model_metadata(model):
template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
template = re.sub(r'raise_exception\([^)]*\)', "''", template)
- template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
- template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS
model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
model_settings['instruction_template_str'] = template
From 3ad59703748dcd5685dcbb7368df45914661e8da Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 17:43:21 -0700
Subject: [PATCH 12/58] Make the llama.cpp --verbose output less verbose
---
modules/llama_cpp_server.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index e3dd43b4..8579f843 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -193,7 +193,7 @@ class LlamaServer:
if shared.args.verbose:
logger.info("GENERATE_PARAMS=")
- printable_payload = {k: (v if k != "prompt" else "[multimodal object]" if pil_images else v) for k, v in payload.items()}
+ printable_payload = {k: v for k, v in payload.items() if k != "prompt"}
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
From b330ec35174f6b1b7e26922bdeef16069441bd8a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 17:54:15 -0700
Subject: [PATCH 13/58] Simplifications
---
modules/chat.py | 26 +++++++++++---------------
1 file changed, 11 insertions(+), 15 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 05ed02bf..530e3a0a 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -108,6 +108,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
tools=state['tools'] if 'tools' in state else None,
tools_in_user_message=False,
add_generation_prompt=False,
+ enable_thinking=state['enable_thinking'],
reasoning_effort=state['reasoning_effort']
)
@@ -262,10 +263,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.append({"role": "user", "content": user_input})
def make_prompt(messages):
- if state['mode'] == 'chat-instruct' and _continue:
- prompt = renderer(messages=messages[:-1])
- else:
- prompt = renderer(messages=messages)
+ prompt = renderer(
+ messages=messages[:-1] if _continue else messages,
+ add_generation_prompt=(state['mode'] != 'chat-instruct')
+ )
if state['mode'] == 'chat-instruct':
command = state['chat-instruct_command']
@@ -287,20 +288,15 @@ def generate_chat_prompt(user_input, state, **kwargs):
# Handle GPT-OSS as a special case when continuing
# (otherwise the thinking block gets removed...)
if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']:
- last_message_to_continue = messages[-1]
- prompt = renderer(messages=messages[:-1])
-
assistant_reply_so_far = "<|start|>assistant"
- if 'thinking' in last_message_to_continue:
- assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>"
+ if 'thinking' in messages[-1]:
+ assistant_reply_so_far += f"<|channel|>analysis<|message|>{messages[-1]['thinking']}<|end|>"
- assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}"
+ assistant_reply_so_far += f"<|channel|>final<|message|>"
prompt += assistant_reply_so_far
- else:
- prompt = renderer(
- messages=messages,
- add_generation_prompt=True
- )
+
+ if _continue:
+ prompt += messages[-1].get('content', '')
return prompt
From c1aa4590ea3d69ba9ae8edd3bf222af27a3cd13b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 18:05:40 -0700
Subject: [PATCH 14/58] Code simplifications, fix impersonate
---
modules/chat.py | 60 ++++++++++++++++++++++++++++---------------------
1 file changed, 34 insertions(+), 26 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 530e3a0a..7c2ab4a3 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -228,44 +228,48 @@ def generate_chat_prompt(user_input, state, **kwargs):
user_input = user_input.strip()
# Check if we have attachments
- has_attachments = False
- if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
- current_row_idx = len(history)
- user_key = f"user_{current_row_idx}"
- has_attachments = user_key in metadata and "attachments" in metadata[user_key]
-
- if (user_input or has_attachments) and not impersonate and not _continue:
- # For the current user input being processed, check if we need to add attachments
+ if not (impersonate or _continue):
+ has_attachments = False
if len(history_data.get('metadata', {})) > 0:
current_row_idx = len(history)
user_key = f"user_{current_row_idx}"
+ has_attachments = user_key in metadata and "attachments" in metadata[user_key]
- if user_key in metadata and "attachments" in metadata[user_key]:
- attachments_text = ""
- image_refs = ""
+ if user_input or has_attachments:
+ # For the current user input being processed, check if we need to add attachments
+ if len(history_data.get('metadata', {})) > 0:
+ current_row_idx = len(history)
+ user_key = f"user_{current_row_idx}"
- for attachment in metadata[user_key]["attachments"]:
- if attachment.get("type") == "image":
- image_refs += "<__media__>"
- else:
- filename = attachment.get("name", "file")
- content = attachment.get("content", "")
- if attachment.get("type") == "text/html" and attachment.get("url"):
- attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
+ if user_key in metadata and "attachments" in metadata[user_key]:
+ attachments_text = ""
+ image_refs = ""
+
+ for attachment in metadata[user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ image_refs += "<__media__>"
else:
- attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+ filename = attachment.get("name", "file")
+ content = attachment.get("content", "")
+ if attachment.get("type") == "text/html" and attachment.get("url"):
+ attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
+ else:
+ attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
- if image_refs:
- user_input = f"{image_refs}\n\n{user_input}"
- if attachments_text:
- user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
+ if image_refs:
+ user_input = f"{image_refs}\n\n{user_input}"
+ if attachments_text:
+ user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
- messages.append({"role": "user", "content": user_input})
+ messages.append({"role": "user", "content": user_input})
+
+ if impersonate:
+ messages.append({"role": "user", "content": "fake user message replace me"})
def make_prompt(messages):
prompt = renderer(
messages=messages[:-1] if _continue else messages,
- add_generation_prompt=(state['mode'] != 'chat-instruct')
+ add_generation_prompt=(state['mode'] != 'chat-instruct' and not impersonate)
)
if state['mode'] == 'chat-instruct':
@@ -298,6 +302,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
if _continue:
prompt += messages[-1].get('content', '')
+ if impersonate:
+ prompt = prompt.split("fake user message replace me", 1)[0]
+ prompt += user_input
+
return prompt
prompt = make_prompt(messages)
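
The "fake user message replace me" sentinel above is easy to miss, so here is a rough standalone sketch of the trick, with a hypothetical render() helper standing in for the project's Jinja2 renderer:

SENTINEL = "fake user message replace me"

def render(messages):
    # Hypothetical renderer: "User:"/"Bot:" turns separated by newlines.
    names = {"user": "User", "assistant": "Bot"}
    return "\n".join(f"{names[m['role']]}: {m['content']}" for m in messages)

def impersonation_prompt(messages, user_input_so_far):
    # Append a sentinel user turn, render, cut at the sentinel, and let the
    # model continue the user's partial message instead.
    prompt = render(messages + [{"role": "user", "content": SENTINEL}])
    prompt = prompt.split(SENTINEL, 1)[0]
    return prompt + user_input_so_far

history = [{"role": "assistant", "content": "How can I help you today?"}]
print(impersonation_prompt(history, "Tell me about"))
# -> Bot: How can I help you today?
#    User: Tell me about
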
From ded6c41cf8b5a95441516515f669766407d9692d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 18:16:17 -0700
Subject: [PATCH 15/58] Fix impersonate for chat-instruct
---
modules/chat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/chat.py b/modules/chat.py
index 7c2ab4a3..d2513e07 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -263,7 +263,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.append({"role": "user", "content": user_input})
- if impersonate:
+ if impersonate and state['mode'] != 'chat-instruct':
messages.append({"role": "user", "content": "fake user message replace me"})
def make_prompt(messages):
From b657be73814329d9d8d81f1cec49fe7c738dc3ee Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 18:22:08 -0700
Subject: [PATCH 16/58] Obtain stopping strings in chat mode
---
modules/chat.py | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index d2513e07..8a9a5a1b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -456,15 +456,17 @@ def get_stopping_strings(state):
]
for text in texts:
- text = text.strip()
- if text.startswith("<") and ">" in text:
- stopping_strings.append(text.split(">")[0] + ">")
- elif text.startswith("[") and "]" in text:
- stopping_strings.append(text.split("]")[0] + "]")
- elif text.startswith("(") and ")" in text:
- stopping_strings.append(text.split(")")[0] + ")")
- elif text.startswith("{") and "}" in text:
- stopping_strings.append(text.split("}")[0] + "}")
+ stripped_text = text.strip()
+ if stripped_text.startswith("<") and ">" in stripped_text:
+ stopping_strings.append(stripped_text.split(">")[0] + ">")
+ elif stripped_text.startswith("[") and "]" in stripped_text:
+ stopping_strings.append(stripped_text.split("]")[0] + "]")
+ elif stripped_text.startswith("(") and ")" in stripped_text:
+ stopping_strings.append(stripped_text.split(")")[0] + ")")
+ elif stripped_text.startswith("{") and "}" in stripped_text:
+ stopping_strings.append(stripped_text.split("}")[0] + "}")
+ elif ":" in text:
+ stopping_strings.append(text.split(":")[0] + ":")
if 'stopping_strings' in state and isinstance(state['stopping_strings'], list):
stopping_strings += state.pop('stopping_strings')
From 6c165d2e55f41f6e0259e2175b9c7314b28a221f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 18:28:43 -0700
Subject: [PATCH 17/58] Fix the chat template
---
modules/shared.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/shared.py b/modules/shared.py
index 644261a0..c3d96b70 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -287,7 +287,7 @@ settings = {
'greeting': 'How can I help you today?',
'custom_system_message': '',
'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}",
- 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}",
+ 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- name2 + ': ' -}}\n{%- endif %}",
# Extensions
'default_extensions': [],
From a531328f7eef0d7e6f4ac85186409ee2320586ee Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 18:41:58 -0700
Subject: [PATCH 18/58] Fix the GPT-OSS stopping string
---
modules/chat.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/modules/chat.py b/modules/chat.py
index 8a9a5a1b..96d36ba5 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -475,6 +475,12 @@ def get_stopping_strings(state):
result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)]
result = list(set(result))
+ # Handle GPT-OSS as a special case
+ if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
+ result.remove("<|end|>")
+ result.append("<|result|>")
+ result = list(set(result))
+
if shared.args.verbose:
logger.info("STOPPING_STRINGS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result)
From 8f660aefe361d396847ebce03d86f0e501561c17 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 18:50:16 -0700
Subject: [PATCH 19/58] Fix chat-instruct replies leaking the bot name
sometimes
---
modules/chat.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/modules/chat.py b/modules/chat.py
index 96d36ba5..a24a5be1 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -825,6 +825,12 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
# Extract the reply
if state['mode'] in ['chat', 'chat-instruct']:
+ reply = reply.lstrip()
+ if reply.startswith(state['name2'] + ':'):
+ reply = reply[len(state['name2'] + ':'):]
+ elif reply.startswith(state['name1'] + ':'):
+ reply = reply[len(state['name1'] + ':'):]
+
visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
else:
visible_reply = reply
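
The intent of the hunk above, written as a tiny standalone helper (simplified; the real code modifies reply in place):

def strip_name_prefix(reply, name1, name2):
    # If the model echoes "Bot:" or "You:" at the start of its reply,
    # drop that prefix before displaying the message.
    reply = reply.lstrip()
    for name in (name2, name1):
        if reply.startswith(name + ":"):
            return reply[len(name) + 1:]
    return reply

print(strip_name_prefix("  Assistant: Sure, here you go.", "You", "Assistant"))
# -> " Sure, here you go."  (leading space kept, as in the patch)
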
From 8fcb4b310242a37b58bd006c28b7bb29a688e767 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 19:10:46 -0700
Subject: [PATCH 20/58] Make bot_prefix extensions functional again
---
modules/chat.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/modules/chat.py b/modules/chat.py
index a24a5be1..5eb9f301 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -306,6 +306,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
prompt = prompt.split("fake user message replace me", 1)[0]
prompt += user_input
+ if state['mode'] in ['chat', 'chat-instruct'] and not impersonate and not _continue:
+ prompt += apply_extensions('bot_prefix', "", state)
+
return prompt
prompt = make_prompt(messages)
From 6a7166fffaa4361a923cf9d6a15a2f6b96a8be6d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 19:46:48 -0700
Subject: [PATCH 21/58] Add support for the Seed-OSS template
---
modules/chat.py | 29 ++++++++++++++++++++++++++++-
modules/html_generator.py | 28 ++++++++++++++++++++++++++--
2 files changed, 54 insertions(+), 3 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 5eb9f301..818d1014 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -109,7 +109,8 @@ def generate_chat_prompt(user_input, state, **kwargs):
tools_in_user_message=False,
add_generation_prompt=False,
enable_thinking=state['enable_thinking'],
- reasoning_effort=state['reasoning_effort']
+ reasoning_effort=state['reasoning_effort'],
+ thinking_budget=-1 if state.get('enable_thinking', True) else 0
)
chat_renderer = partial(
@@ -190,6 +191,30 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.insert(insert_pos, msg_dict)
+ # Handle Seed-OSS
+ elif '<seed:think>' in assistant_msg:
+ thinking_content = ""
+ final_content = assistant_msg
+
+ # Extract thinking content if present
+ if '<seed:think>' in assistant_msg:
+ parts = assistant_msg.split('<seed:think>', 1)
+ if len(parts) > 1:
+ potential_content = parts[1]
+ if '</seed:think>' in potential_content:
+ thinking_content = potential_content.split('</seed:think>', 1)[0].strip()
+ final_content = parts[0] + potential_content.split('</seed:think>', 1)[1]
+ else:
+ thinking_content = potential_content.strip()
+ final_content = parts[0]
+
+ # Insert as structured message
+ msg_dict = {"role": "assistant", "content": final_content.strip()}
+ if thinking_content:
+ msg_dict["reasoning_content"] = thinking_content
+
+ messages.insert(insert_pos, msg_dict)
+
else:
# Default case (used by all other models)
messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
@@ -687,6 +712,8 @@ def generate_search_query(user_message, state):
query = query.rsplit("", 1)[1]
elif "<|start|>assistant<|channel|>final<|message|>" in query:
query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
+ elif "</seed:think>" in query:
+ query = query.rsplit("</seed:think>", 1)[1]
# Strip and remove surrounding quotes if present
query = query.strip()
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 279f9ba6..63844f35 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -137,7 +137,7 @@ def extract_thinking_block(string):
remaining_content = string[content_start:]
return thinking_content, remaining_content
- # If think tags not found, try alternative format
+ # If think tags not found, try GPT-OSS alternative format
ALT_START = "<|channel|>analysis<|message|>"
ALT_END = "<|end|>"
ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>"
@@ -168,7 +168,31 @@ def extract_thinking_block(string):
remaining_content = string[content_start:]
return thinking_content, remaining_content
- # Return if neither format is found
+ # Try seed:think format
+ SEED_START = "<seed:think>"
+ SEED_END = "</seed:think>"
+
+ seed_start_pos = string.find(SEED_START)
+ seed_end_pos = string.find(SEED_END)
+
+ if seed_start_pos != -1 or seed_end_pos != -1:
+ if seed_start_pos == -1:
+ thought_start = 0
+ else:
+ thought_start = seed_start_pos + len(SEED_START)
+
+ if seed_end_pos == -1:
+ thought_end = len(string)
+ content_start = len(string)
+ else:
+ thought_end = seed_end_pos
+ content_start = seed_end_pos + len(SEED_END)
+
+ thinking_content = string[thought_start:thought_end]
+ remaining_content = string[content_start:]
+ return thinking_content, remaining_content
+
+ # Return if no format is found
return None, string
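
For reference, a condensed standalone version of the seed:think handling added above, reduced to the simple case where both tags are present:

def split_seed_think(text):
    start_tag, end_tag = "<seed:think>", "</seed:think>"
    if start_tag not in text or end_tag not in text:
        return None, text
    before, rest = text.split(start_tag, 1)
    thinking, after = rest.split(end_tag, 1)
    return thinking.strip(), (before + after).strip()

thinking, answer = split_seed_think(
    "<seed:think>The user wants a greeting.</seed:think>Hello!"
)
print(thinking)  # The user wants a greeting.
print(answer)    # Hello!
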
From 02ca96fa44e0a29eb52aad46fafd0995e1d91d42 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 22:17:22 -0700
Subject: [PATCH 22/58] Multiple fixes
---
modules/chat.py | 38 +++++++++++++++++++++++---------------
modules/shared.py | 4 ++--
2 files changed, 25 insertions(+), 17 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 818d1014..3c61a0dd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -292,9 +292,22 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.append({"role": "user", "content": "fake user message replace me"})
def make_prompt(messages):
+ last_message = messages[-1].copy()
+ if _continue:
+ if state['mode'] == 'chat-instruct':
+ messages = messages[:-1]
+ else:
+ messages[-1]["content"] = "fake assistant message replace me"
+ messages.append({"role": "assistant", "content": "this will get deleted"})
+
+ if state['mode'] != 'chat-instruct':
+ add_generation_prompt = (not _continue and not impersonate)
+ else:
+ add_generation_prompt = False
+
prompt = renderer(
- messages=messages[:-1] if _continue else messages,
- add_generation_prompt=(state['mode'] != 'chat-instruct' and not impersonate)
+ messages=messages,
+ add_generation_prompt=add_generation_prompt
)
if state['mode'] == 'chat-instruct':
@@ -308,24 +321,19 @@ def generate_chat_prompt(user_input, state, **kwargs):
outer_messages.append({"role": "system", "content": state['custom_system_message']})
outer_messages.append({"role": "user", "content": command})
+ if _continue:
+ outer_messages.append(last_message.copy())
+ outer_messages[-1]["content"] = "fake assistant message replace me"
+ outer_messages.append({"role": "assistant", "content": "this will get deleted"})
prompt = instruct_renderer(
messages=outer_messages,
- add_generation_prompt=True
+ add_generation_prompt=not _continue
)
- else:
- # Handle GPT-OSS as a special case when continuing
- # (otherwise the thinking block gets removed...)
- if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']:
- assistant_reply_so_far = "<|start|>assistant"
- if 'thinking' in messages[-1]:
- assistant_reply_so_far += f"<|channel|>analysis<|message|>{messages[-1]['thinking']}<|end|>"
-
- assistant_reply_so_far += f"<|channel|>final<|message|>"
- prompt += assistant_reply_so_far
if _continue:
- prompt += messages[-1].get('content', '')
+ prompt = prompt.split("fake assistant message replace me", 1)[0]
+ prompt += last_message.get("content", "")
if impersonate:
prompt = prompt.split("fake user message replace me", 1)[0]
@@ -453,7 +461,7 @@ def get_stopping_strings(state):
renderer = partial(template.render, add_generation_prompt=False)
renderers.append(renderer)
- if state['mode'] in ['chat', 'chat-instruct']:
+ if state['mode'] in ['chat']:
template = jinja_env.from_string(state['chat_template_str'])
renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2'])
renderers.append(renderer)
diff --git a/modules/shared.py b/modules/shared.py
index c3d96b70..3e72acca 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -203,7 +203,7 @@ settings = {
'start_with': '',
'mode': 'instruct',
'chat_style': 'cai-chat',
- 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
+ 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>". Reply directly, without starting the reply with the character name.\n\n<|prompt|>',
'enable_web_search': False,
'web_search_pages': 3,
'prompt-notebook': '',
@@ -287,7 +287,7 @@ settings = {
'greeting': 'How can I help you today?',
'custom_system_message': '',
'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}",
- 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- name2 + ': ' -}}\n{%- endif %}",
+ 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- name2 + ':' -}}\n{%- endif %}",
# Extensions
'default_extensions': [],
From 750adf793dcf1bc4c5140c84f76b932dc454c194 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 26 Aug 2025 11:58:49 -0700
Subject: [PATCH 23/58] UI: Preserve chat scroll position on textarea resize
---
js/main.js | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/js/main.js b/js/main.js
index 4b4b14c2..4ada64f6 100644
--- a/js/main.js
+++ b/js/main.js
@@ -1065,3 +1065,57 @@ document.fonts.addEventListener("loadingdone", (event) => {
}
}, 50);
});
+
+//------------------------------------------------
+// Preserve chat scroll position on textarea resize
+//------------------------------------------------
+(function() {
+ let chatParent = null;
+ let initialState = null;
+ let debounceTimeout = null;
+
+ function getChatParent() {
+ if (!chatParent) chatParent = document.querySelector(".chat-parent");
+ return chatParent;
+ }
+
+ function getTextarea() {
+ return document.querySelector("#chat-input textarea");
+ }
+
+ document.addEventListener("input", function(e) {
+ if (e.target.matches("#chat-input textarea")) {
+ const chat = getChatParent();
+ const textarea = getTextarea();
+
+ if (chat && textarea) {
+ // Capture initial state only on first input of a typing sequence
+ if (!initialState) {
+ initialState = {
+ scrollTop: chat.scrollTop,
+ textareaHeight: textarea.offsetHeight
+ };
+ }
+
+ // Clear existing timeout
+ clearTimeout(debounceTimeout);
+
+ // Wait for typing to stop (50ms delay)
+ debounceTimeout = setTimeout(() => {
+ const finalTextareaHeight = textarea.offsetHeight;
+ const totalGrowth = finalTextareaHeight - initialState.textareaHeight;
+ const targetScroll = initialState.scrollTop + totalGrowth;
+
+ const restore = () => { chat.scrollTop = targetScroll; };
+
+ restore();
+ requestAnimationFrame(restore);
+ setTimeout(restore, 0);
+ setTimeout(restore, 10);
+
+ initialState = null;
+ }, 50);
+ }
+ }
+ }, true);
+})();
From ccc8a2229dd82ae8d77274341785e043f6f3a343 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 26 Aug 2025 13:59:54 -0700
Subject: [PATCH 24/58] Revert "UI: Preserve chat scroll position on textarea
resize"
This reverts commit 750adf793dcf1bc4c5140c84f76b932dc454c194.
---
js/main.js | 54 ------------------------------------------------------
1 file changed, 54 deletions(-)
diff --git a/js/main.js b/js/main.js
index 4ada64f6..4b4b14c2 100644
--- a/js/main.js
+++ b/js/main.js
@@ -1065,57 +1065,3 @@ document.fonts.addEventListener("loadingdone", (event) => {
}
}, 50);
});
-
-//------------------------------------------------
-// Preserve chat scroll position on textarea resize
-//------------------------------------------------
-(function() {
- let chatParent = null;
- let initialState = null;
- let debounceTimeout = null;
-
- function getChatParent() {
- if (!chatParent) chatParent = document.querySelector(".chat-parent");
- return chatParent;
- }
-
- function getTextarea() {
- return document.querySelector("#chat-input textarea");
- }
-
- document.addEventListener("input", function(e) {
- if (e.target.matches("#chat-input textarea")) {
- const chat = getChatParent();
- const textarea = getTextarea();
-
- if (chat && textarea) {
- // Capture initial state only on first input of a typing sequence
- if (!initialState) {
- initialState = {
- scrollTop: chat.scrollTop,
- textareaHeight: textarea.offsetHeight
- };
- }
-
- // Clear existing timeout
- clearTimeout(debounceTimeout);
-
- // Wait for typing to stop (50ms delay)
- debounceTimeout = setTimeout(() => {
- const finalTextareaHeight = textarea.offsetHeight;
- const totalGrowth = finalTextareaHeight - initialState.textareaHeight;
- const targetScroll = initialState.scrollTop + totalGrowth;
-
- const restore = () => { chat.scrollTop = targetScroll; };
-
- restore();
- requestAnimationFrame(restore);
- setTimeout(restore, 0);
- setTimeout(restore, 10);
-
- initialState = null;
- }, 50);
- }
- }
- }, true);
-})();
From 8042f76399f9aa84c4d16dd10fb9ddf3d01238a4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 Aug 2025 05:37:01 -0700
Subject: [PATCH 25/58] Make portable installs functional with Python 3.13
---
requirements/full/requirements.txt | 1 +
requirements/full/requirements_amd.txt | 1 +
requirements/full/requirements_amd_noavx2.txt | 1 +
requirements/full/requirements_apple_intel.txt | 1 +
requirements/full/requirements_apple_silicon.txt | 1 +
requirements/full/requirements_cpu_only.txt | 1 +
requirements/full/requirements_cpu_only_noavx2.txt | 1 +
requirements/full/requirements_noavx2.txt | 1 +
requirements/full/requirements_nowheels.txt | 1 +
requirements/portable/requirements.txt | 1 +
requirements/portable/requirements_apple_intel.txt | 1 +
requirements/portable/requirements_apple_silicon.txt | 1 +
requirements/portable/requirements_cpu_only.txt | 1 +
requirements/portable/requirements_cpu_only_noavx2.txt | 1 +
requirements/portable/requirements_noavx2.txt | 1 +
requirements/portable/requirements_nowheels.txt | 1 +
requirements/portable/requirements_vulkan.txt | 1 +
requirements/portable/requirements_vulkan_noavx2.txt | 1 +
18 files changed, 18 insertions(+)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index c53e7722..77ddc8fb 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
bitsandbytes==0.46.*
colorama
datasets
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 776dbc7b..802f6724 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 7205a72e..bbb4fa59 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index f4e00c5b..b721bcce 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 0b28ad21..80b168d2 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index fcbb48f0..5bfcdea6 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 1705791e..31743a21 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 7e951219..0d04d229 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
bitsandbytes==0.46.*
colorama
datasets
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index cd85a744..74d86047 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -1,4 +1,5 @@
accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 9eb8b33c..ca0c4017 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 0a27f61b..b5a853ba 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 79674e5a..995f5f26 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 3ebb7d8b..3b5d9442 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 81b78fe6..4bc705ec 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 58dc529a..a4dc4de9 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index b7b73eff..be624bb1 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 5ad8ede1..4367e180 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 0adf6e48..2130efcc 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
From 73442a2b6d0f2de333c26cbdde862f3f7b84d8a8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 Aug 2025 05:43:13 -0700
Subject: [PATCH 26/58] UI: Better handle the chat input position with CSS
This also solves scrolling issues with the main chat content
when the height of the textarea increases.
---
css/chat_style-messenger.css | 2 ++
css/main.css | 4 ++++
js/main.js | 27 +++++++++++++++++++++++++++
3 files changed, 33 insertions(+)
diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css
index 583703c0..70fd6d4a 100644
--- a/css/chat_style-messenger.css
+++ b/css/chat_style-messenger.css
@@ -99,9 +99,11 @@
.message-body p em {
color: rgb(110 110 110) !important;
}
+
.editing-textarea {
width: max(30rem) !important;
}
+
.circle-you + .text .edit-control-button, .circle-you + .text .editing-textarea {
color: #000 !important;
}
diff --git a/css/main.css b/css/main.css
index 062d3eb2..b799f595 100644
--- a/css/main.css
+++ b/css/main.css
@@ -404,6 +404,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
flex: 1;
overflow: auto !important;
border-radius: 0 !important;
+ margin-bottom: 75px;
}
.chat-parent .prose {
@@ -626,6 +627,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
max-width: 54rem;
left: 50%;
transform: translateX(-50%);
+ position: absolute;
+ bottom: 0;
+ background: var(--body-background-fill);
}
@media print {
diff --git a/js/main.js b/js/main.js
index 4b4b14c2..9b9d685a 100644
--- a/js/main.js
+++ b/js/main.js
@@ -1065,3 +1065,30 @@ document.fonts.addEventListener("loadingdone", (event) => {
}
}, 50);
});
+
+(function() {
+ const chatParent = document.querySelector(".chat-parent");
+ const chatInputRow = document.querySelector("#chat-input-row");
+ const originalMarginBottom = 75;
+ let originalHeight = chatInputRow.offsetHeight;
+
+ function updateMargin() {
+ const currentHeight = chatInputRow.offsetHeight;
+ const heightDifference = currentHeight - originalHeight;
+ chatParent.style.marginBottom = `${originalMarginBottom + heightDifference}px`;
+ }
+
+ // Watch for changes that might affect height
+ const observer = new MutationObserver(updateMargin);
+ observer.observe(chatInputRow, {
+ childList: true,
+ subtree: true,
+ attributes: true
+ });
+
+ // Also listen for window resize
+ window.addEventListener("resize", updateMargin);
+
+ // Initial call to set the margin based on current state
+ updateMargin();
+})();
From 0b4518e61cfe7993017c54d09328dd364301128f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 Aug 2025 05:53:09 -0700
Subject: [PATCH 27/58] "Text generation web UI" -> "Text Generation Web UI"
---
README.md | 4 ++--
modules/shared.py | 2 +-
server.py | 6 +++---
3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index 6b49cee0..d42697dd 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Text generation web UI
+# Text Generation Web UI
A Gradio web UI for Large Language Models.
@@ -238,7 +238,7 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [-
[--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY]
[--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
-Text generation web UI
+Text Generation Web UI
options:
-h, --help show this help message and exit
diff --git a/modules/shared.py b/modules/shared.py
index 3e72acca..a3085239 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -31,7 +31,7 @@ persistent_interface_state = {}
need_restart = False
# Parser copied from https://github.com/vladmandic/automatic
-parser = argparse.ArgumentParser(description="Text generation web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200))
+parser = argparse.ArgumentParser(description="Text Generation Web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200))
# Basic settings
group = parser.add_argument_group('Basic settings')
diff --git a/server.py b/server.py
index e6687a3c..52463a3c 100644
--- a/server.py
+++ b/server.py
@@ -70,7 +70,7 @@ from modules.utils import gradio
def signal_handler(sig, frame):
- logger.info("Received Ctrl+C. Shutting down Text generation web UI gracefully.")
+ logger.info("Received Ctrl+C. Shutting down Text Generation Web UI gracefully.")
# Explicitly stop LlamaServer to avoid __del__ cleanup issues during shutdown
if shared.model and shared.model.__class__.__name__ == 'LlamaServer':
@@ -87,7 +87,7 @@ signal.signal(signal.SIGINT, signal_handler)
def create_interface():
- title = 'Text generation web UI'
+ title = 'Text Generation Web UI'
# Password authentication
auth = []
@@ -230,7 +230,7 @@ def create_interface():
if __name__ == "__main__":
- logger.info("Starting Text generation web UI")
+ logger.info("Starting Text Generation Web UI")
do_cmd_flags_warnings()
# Load custom settings
From 030ba7bfeb0e7aed7c8a176e13cc64cd75489d23 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 Aug 2025 07:44:35 -0700
Subject: [PATCH 28/58] UI: Mention that Seed-OSS uses enable_thinking
---
modules/ui_chat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 94c980bb..1dbac13b 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -81,7 +81,7 @@ def create_ui():
gr.HTML("")
shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
- shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.')
+ shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS, pre-2507 Qwen3.')
gr.HTML("")
From a92758a1444626167468f0b0552a642b1e9245a2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 Aug 2025 16:15:20 -0700
Subject: [PATCH 29/58] llama.cpp: Fix obtaining the maximum sequence length
for GPT-OSS
---
modules/models_settings.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 7645880f..6dc000b4 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -68,7 +68,7 @@ def get_model_metadata(model):
metadata = load_gguf_metadata_with_cache(model_file)
for k in metadata:
- if k.endswith('context_length'):
+ if k.endswith('.context_length'):
model_settings['ctx_size'] = min(metadata[k], 8192)
model_settings['truncation_length_info'] = metadata[k]
elif k.endswith('rope.freq_base'):
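Why the leading dot matters: GPT-OSS GGUF metadata can also carry keys such as a rope-scaling "original context length" that end in the same substring, and the loose match could let that smaller value win when setting ctx_size. A minimal sketch with illustrative key names (not read from a real GGUF file):

```python
# Hypothetical metadata keys, for illustration only.
metadata = {
    "gpt-oss.context_length": 131072,
    "gpt-oss.rope.scaling.original_context_length": 4096,
}

loose = [k for k in metadata if k.endswith("context_length")]
anchored = [k for k in metadata if k.endswith(".context_length")]

print(loose)     # both keys match, so the 4096 value could clobber the real limit
print(anchored)  # only 'gpt-oss.context_length' matches
```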
From ba6041251d200dfffaf6ea46dd492554a254b241 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 Aug 2025 06:20:00 -0700
Subject: [PATCH 30/58] UI: Minor change
---
modules/ui_chat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 1dbac13b..31a7a4fc 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -81,7 +81,7 @@ def create_ui():
gr.HTML("")
shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
- shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS, pre-2507 Qwen3.')
+ shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')
gr.HTML("")
From a336a8bbeb53136c40040be8d7e18e79eec034df Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 Aug 2025 08:26:40 -0700
Subject: [PATCH 31/58] UI: Fix italic and quote color in headings
---
css/html_instruct_style.css | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 3e5ebe67..22901c4d 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -13,7 +13,9 @@
line-height: 28px !important;
}
-.dark .chat .message-body :is(p, li, q, em, h1, h2, h3, h4, h5, h6) {
+.dark .chat .message-body :is(p,li,h1,h2,h3,h4,h5,h6),
+.dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6) em),
+.dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6) q) {
color: #d1d5db !important;
}
From cfc83745ec96ad963282620524f94b08776de5b6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 Aug 2025 08:34:48 -0700
Subject: [PATCH 32/58] UI: Improve right sidebar borders in light mode
---
css/main.css | 9 +++++++++
modules/ui_chat.py | 8 ++++----
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/css/main.css b/css/main.css
index b799f595..a7ed2534 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1697,3 +1697,12 @@ button:focus {
#chat-input span {
display: none;
}
+
+.sidebar-vertical-separator {
+ margin: 0;
+ border-bottom: var(--input-border-width) solid var(--input-border-color);
+}
+
+.dark .sidebar-vertical-separator {
+ border-bottom: 1px solid rgba(255,255,255,0.1);
+}
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 31a7a4fc..1e8218a9 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -78,18 +78,18 @@ def create_ui():
with gr.Row():
shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
- gr.HTML("")
+ gr.HTML("")
shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')
- gr.HTML("")
+ gr.HTML("")
shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search')
with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)
- gr.HTML("")
+ gr.HTML("")
with gr.Row():
shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
@@ -100,7 +100,7 @@ def create_ui():
with gr.Row():
shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])
- gr.HTML("")
+ gr.HTML("")
with gr.Row():
shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm')
From cb8780a4ce617b9a53d0ffb535f6b18b82b5f3bf Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 Aug 2025 11:13:19 -0700
Subject: [PATCH 33/58] Safer check for is_multimodal when loading models
Avoids an unrelated multimodal error when a model fails to load due to
lack of memory.
---
modules/models.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index cae88ac5..133131d7 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -57,7 +57,7 @@ def load_model(model_name, loader=None):
shared.settings['truncation_length'] = shared.args.ctx_size
shared.is_multimodal = False
- if loader.lower() in ('exllamav3', 'llama.cpp'):
+ if loader.lower() in ('exllamav3', 'llama.cpp') and hasattr(model, 'is_multimodal'):
shared.is_multimodal = model.is_multimodal()
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
From d9eec31886246d5501b6502457c917fb46e9e748 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 Aug 2025 17:46:29 -0700
Subject: [PATCH 34/58] UI: Suppress "Attempted to select a non-interactive or
hidden tab" warnings
---
js/global_scope_js.js | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index ebed1f3d..89b51d67 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -372,3 +372,18 @@ observer.observe(document.documentElement, {
subtree: true,
attributeFilter: ["style"]
});
+
+//------------------------------------------------
+// Suppress "Attempted to select a non-interactive or hidden tab" warning
+//------------------------------------------------
+(function() {
+ const originalWarn = console.warn;
+
+ console.warn = function(...args) {
+ if (args[0] && args[0].includes("Attempted to select a non-interactive or hidden tab")) {
+ return;
+ }
+
+ originalWarn.apply(console, args);
+ };
+})();
From 272095547845a69a725c4564d63f40e50f47e563 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 Aug 2025 19:48:16 -0700
Subject: [PATCH 35/58] Fix a bug after
d9eec31886246d5501b6502457c917fb46e9e748
---
js/global_scope_js.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 89b51d67..d8de2b58 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -380,7 +380,7 @@ observer.observe(document.documentElement, {
const originalWarn = console.warn;
console.warn = function(...args) {
- if (args[0] && args[0].includes("Attempted to select a non-interactive or hidden tab")) {
+ if (args[0] && typeof args[0] === 'string' && args[0].includes("Attempted to select a non-interactive or hidden tab")) {
return;
}
From fc2eb48664bc8e29034904cac43e6c0bc89aa727 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 Aug 2025 20:19:03 -0700
Subject: [PATCH 36/58] Style fixes after
73442a2b6d0f2de333c26cbdde862f3f7b84d8a8
---
css/main.css | 12 ------------
js/show_controls.js | 12 ------------
2 files changed, 24 deletions(-)
diff --git a/css/main.css b/css/main.css
index a7ed2534..cde01aa4 100644
--- a/css/main.css
+++ b/css/main.css
@@ -429,10 +429,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
margin-left: 5px;
}
-.chat-parent.bigchat {
- flex: 1;
-}
-
.chat > .messages {
display: flex;
flex-direction: column;
@@ -832,10 +828,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
padding: 1rem;
}
-#chat-input-row.bigchat {
- padding-bottom: 1px !important;
-}
-
#chat-col {
height: 100dvh;
display: flex;
@@ -851,10 +843,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
}
-#chat-col.bigchat {
- padding-bottom: 15px !important;
-}
-
.message-body ol, .message-body ul {
margin-top: 0 !important;
margin-bottom: 1.25em !important;
diff --git a/js/show_controls.js b/js/show_controls.js
index f974d412..ff513395 100644
--- a/js/show_controls.js
+++ b/js/show_controls.js
@@ -20,12 +20,6 @@ function toggle_controls(value) {
extensions.style.display = "inherit";
}
- // Remove bigchat classes
- chatParent.classList.remove("bigchat");
- document.getElementById("chat-input-row").classList.remove("bigchat");
- document.getElementById("chat-col").classList.remove("bigchat");
- document.getElementById("chat-tab").style.paddingBottom = "";
-
let gallery_element = document.getElementById("gallery-extension");
if (gallery_element) {
gallery_element.style.display = "block";
@@ -47,11 +41,5 @@ function toggle_controls(value) {
if (extensions) {
extensions.style.display = "none";
}
-
- // Add bigchat classes
- chatParent.classList.add("bigchat");
- document.getElementById("chat-input-row").classList.add("bigchat");
- document.getElementById("chat-col").classList.add("bigchat");
- document.getElementById("chat-tab").style.paddingBottom = "0px";
}
}
From d78b7d0fad31c6b9bf89a89f748e1b00c27c5946 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 Aug 2025 20:22:07 -0700
Subject: [PATCH 37/58] Lint
---
js/global_scope_js.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index d8de2b58..4d8c1121 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -380,7 +380,7 @@ observer.observe(document.documentElement, {
const originalWarn = console.warn;
console.warn = function(...args) {
- if (args[0] && typeof args[0] === 'string' && args[0].includes("Attempted to select a non-interactive or hidden tab")) {
+ if (args[0] && typeof args[0] === "string" && args[0].includes("Attempted to select a non-interactive or hidden tab")) {
return;
}
From 084675cf75d08ce2d82fe440abbaf52e429eed3a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 29 Aug 2025 09:11:10 -0700
Subject: [PATCH 38/58] UI: Improve thinking blocks in chat-instruct mode
---
css/main.css | 1 +
1 file changed, 1 insertion(+)
diff --git a/css/main.css b/css/main.css
index cde01aa4..f2793372 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1354,6 +1354,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
cursor: pointer;
user-select: none;
font-size: 14px;
+ line-height: var(--line-sm);
color: rgb(0 0 0 / 70%);
transition: background-color 0.2s;
}
From a2b37adb265847c878d71107fca988851090b46f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 29 Aug 2025 09:25:44 -0700
Subject: [PATCH 39/58] UI: Preload the correct fonts for chat mode
---
modules/block_requests.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/modules/block_requests.py b/modules/block_requests.py
index 618b4bd6..911e41d9 100644
--- a/modules/block_requests.py
+++ b/modules/block_requests.py
@@ -53,8 +53,9 @@ def my_open(*args, **kwargs):
'',
'\n '
'\n '
- '\n '
- '\n '
+ '\n '
+ '\n '
+ '\n '
'\n '
'\n '
'\n '
From 07a2e226c165fd5917c778f18c0f0fd4bcef38b7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 29 Aug 2025 14:08:38 -0700
Subject: [PATCH 40/58] UI: Minor font color fixes in instruct mode
---
css/html_instruct_style.css | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 22901c4d..6dee0a89 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -14,8 +14,8 @@
}
.dark .chat .message-body :is(p,li,h1,h2,h3,h4,h5,h6),
-.dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6) em),
-.dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6) q) {
+.dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6,b,strong) em),
+.dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6,b,strong) q) {
color: #d1d5db !important;
}
From 08f90f4b64565424f812e9a0447338c022d883f2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 29 Aug 2025 14:09:04 -0700
Subject: [PATCH 41/58] Lint
---
css/main.css | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/css/main.css b/css/main.css
index f2793372..c7ee57da 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1693,5 +1693,5 @@ button:focus {
}
.dark .sidebar-vertical-separator {
- border-bottom: 1px solid rgba(255,255,255,0.1);
+ border-bottom: 1px solid rgb(255 255 255 / 10%);
}
From a3eb67e466f9cc23a6c3607842375e8bf50f2dd0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 08:42:26 -0700
Subject: [PATCH 42/58] Fix the UI failing to launch if the Notebook prompt is
too long
---
modules/prompts.py | 3 +--
modules/ui_default.py | 3 +--
modules/ui_notebook.py | 3 +--
server.py | 8 ++++++++
4 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/modules/prompts.py b/modules/prompts.py
index 79d9b56e..b800af91 100644
--- a/modules/prompts.py
+++ b/modules/prompts.py
@@ -22,8 +22,7 @@ def load_prompt(fname):
if file_path.exists():
with open(file_path, 'r', encoding='utf-8') as f:
text = f.read()
- if len(text) > 0 and text[-1] == '\n':
- text = text[:-1]
+ text = text.rstrip()
return text
else:
diff --git a/modules/ui_default.py b/modules/ui_default.py
index 44af48a3..c0feae19 100644
--- a/modules/ui_default.py
+++ b/modules/ui_default.py
@@ -22,8 +22,7 @@ def create_ui():
with gr.Row():
with gr.Column():
with gr.Row():
- initial_text = load_prompt(shared.settings['prompt-notebook'])
- shared.gradio['textbox-default'] = gr.Textbox(value=initial_text, lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar'])
+ shared.gradio['textbox-default'] = gr.Textbox(value="", lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar'])
shared.gradio['token-counter-default'] = gr.HTML(value="0", elem_id="default-token-counter")
with gr.Row():
diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py
index 939d81f7..9fab879b 100644
--- a/modules/ui_notebook.py
+++ b/modules/ui_notebook.py
@@ -30,8 +30,7 @@ def create_ui():
with gr.Column(scale=4):
with gr.Tab('Raw'):
with gr.Row():
- initial_text = load_prompt(shared.settings['prompt-notebook'])
- shared.gradio['textbox-notebook'] = gr.Textbox(label="", value=initial_text, lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar'])
+ shared.gradio['textbox-notebook'] = gr.Textbox(label="", value="", lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar'])
shared.gradio['token-counter-notebook'] = gr.HTML(value="0", elem_id="notebook-token-counter")
with gr.Tab('Markdown'):
diff --git a/server.py b/server.py
index 52463a3c..c804c342 100644
--- a/server.py
+++ b/server.py
@@ -6,6 +6,7 @@ from pathlib import Path
from modules import shared
from modules.block_requests import OpenMonkeyPatch, RequestBlocker
from modules.logging_colors import logger
+from modules.prompts import load_prompt
# Set up Gradio temp directory path
gradio_temp_path = Path('user_data') / 'cache' / 'gradio'
@@ -109,6 +110,13 @@ def create_interface():
'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp'
})
+ if shared.settings['prompt-notebook']:
+ prompt = load_prompt(shared.settings['prompt-notebook'])
+ shared.persistent_interface_state.update({
+ 'textbox-default': prompt,
+ 'textbox-notebook': prompt
+ })
+
# Clear existing cache files
for cache_file in ['pfp_character.png', 'pfp_character_thumb.png']:
cache_path = Path(f"user_data/cache/{cache_file}")
From 96136ea76008cf1fb440050936b6b5bca5bd3d85 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 10:13:32 -0700
Subject: [PATCH 43/58] Fix LaTeX rendering for equations with asterisks
---
modules/html_generator.py | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 63844f35..9f8c28e5 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -243,6 +243,27 @@ def process_markdown_content(string):
if not string:
return ""
+ # Define a unique placeholder for LaTeX asterisks
+ LATEX_ASTERISK_PLACEHOLDER = "LATEXASTERISKPLACEHOLDER"
+
+ def protect_asterisks_in_latex(match):
+ """A replacer function for re.sub to protect asterisks in multiple LaTeX formats."""
+ # Check which delimiter group was captured
+ if match.group(1) is not None: # Content from $$...$$
+ content = match.group(1)
+ modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
+ return f'$${modified_content}$$'
+ elif match.group(2) is not None: # Content from \[...\]
+ content = match.group(2)
+ modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
+ return f'\\[{modified_content}\\]'
+ elif match.group(3) is not None: # Content from \(...\)
+ content = match.group(3)
+ modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
+ return f'\\({modified_content}\\)'
+
+ return match.group(0) # Fallback
+
# Make \[ \] LaTeX equations inline
pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$'
replacement = r'\\[ \1 \\]'
@@ -272,6 +293,10 @@ def process_markdown_content(string):
string = string.replace('\\end{equation*}', '$$')
string = re.sub(r"(.)```", r"\1\n```", string)
+ # Protect asterisks within all LaTeX blocks before markdown conversion
+ latex_pattern = re.compile(r'\$\$(.*?)\$\$|\\\[(.*?)\\\]|\\\((.*?)\\\)', re.DOTALL)
+ string = latex_pattern.sub(protect_asterisks_in_latex, string)
+
result = ''
is_code = False
is_latex = False
@@ -330,6 +355,9 @@ def process_markdown_content(string):
# Convert to HTML using markdown
html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])
+ # Restore the LaTeX asterisks after markdown conversion
+ html_output = html_output.replace(LATEX_ASTERISK_PLACEHOLDER, '*')
+
# Remove extra newlines before
html_output = re.sub(r'\s*', '', html_output)
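The same protect/convert/restore round trip, condensed into a standalone sketch (the input string is illustrative; markdown is the library the renderer already uses):

```python
import re
import markdown  # same library used for chat rendering

PLACEHOLDER = "LATEXASTERISKPLACEHOLDER"
LATEX_PATTERN = re.compile(r"\$\$(.*?)\$\$|\\\[(.*?)\\\]|\\\((.*?)\\\)", re.DOTALL)

def protect(match):
    # The delimiters contain no asterisks, so swapping them inside the whole
    # match is equivalent to handling each capture group separately.
    return match.group(0).replace("*", PLACEHOLDER)

text = r"The convolution $$ (f * g)(t) $$ uses a literal asterisk, while *this* is emphasis."
protected = LATEX_PATTERN.sub(protect, text)
html = markdown.markdown(protected).replace(PLACEHOLDER, "*")
print(html)  # the asterisk inside $$ ... $$ survives; *this* still becomes <em>this</em>
```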
From cf1aad2a687622358e121497c30508e960267662 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 12:16:45 -0700
Subject: [PATCH 44/58] Fix "continue" for Byte-OSS for partial thinking blocks
---
modules/chat.py | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/modules/chat.py b/modules/chat.py
index 3c61a0dd..6d85bc6e 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -333,7 +333,23 @@ def generate_chat_prompt(user_input, state, **kwargs):
if _continue:
prompt = prompt.split("fake assistant message replace me", 1)[0]
- prompt += last_message.get("content", "")
+ content = last_message.get("content", "")
+ thinking = last_message.get("thinking", "")
+ reasoning = last_message.get("reasoning_content", "")
+
+ partial_thought = thinking or reasoning
+ # Handle partial thinking blocks (GPT-OSS and Seed-OSS)
+ if partial_thought and partial_thought.strip():
+ search_string = partial_thought.strip()
+ index = prompt.rfind(search_string)
+ if index != -1:
+ prompt = prompt[:index] + partial_thought
+ else:
+ # Fallback
+ prompt += content
+ else:
+ # All other cases
+ prompt += content
if impersonate:
prompt = prompt.split("fake user message replace me", 1)[0]
From 3a3e247f3cb21ec7c13bcfa9f21757e216d5c7ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 12:36:35 -0700
Subject: [PATCH 45/58] Even better way to handle continue for thinking blocks
---
modules/chat.py | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 6d85bc6e..ad2f4001 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -333,20 +333,19 @@ def generate_chat_prompt(user_input, state, **kwargs):
if _continue:
prompt = prompt.split("fake assistant message replace me", 1)[0]
- content = last_message.get("content", "")
- thinking = last_message.get("thinking", "")
- reasoning = last_message.get("reasoning_content", "")
- partial_thought = thinking or reasoning
+ content = last_message.get("content", "")
+ partial_thought = last_message.get("thinking", "") or last_message.get("reasoning_content", "")
+
# Handle partial thinking blocks (GPT-OSS and Seed-OSS)
- if partial_thought and partial_thought.strip():
+ if not content and partial_thought and partial_thought.strip():
search_string = partial_thought.strip()
index = prompt.rfind(search_string)
if index != -1:
prompt = prompt[:index] + partial_thought
else:
- # Fallback
- prompt += content
+ # Fallback if search fails: just append the thought
+ prompt += partial_thought
else:
# All other cases
prompt += content
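The thinking-block "continue" logic above, reduced to a small sketch with made-up template markers (the real prompt comes from the model's chat template, not these strings):

```python
def continue_prompt(prompt: str, last_message: dict) -> str:
    content = last_message.get("content", "")
    partial_thought = last_message.get("thinking", "") or last_message.get("reasoning_content", "")

    if not content and partial_thought.strip():
        index = prompt.rfind(partial_thought.strip())
        if index != -1:
            # Cut back to where the template rendered the (stripped) thought and
            # splice in the raw text, so generation resumes mid-thought.
            return prompt[:index] + partial_thought
        return prompt + partial_thought  # fallback: just append the thought
    return prompt + content

prompt = "<|assistant|><think>Count the r's in strawberry"
message = {"content": "", "thinking": "Count the r's in strawberry "}
print(continue_prompt(prompt, message))  # ends with the raw thought, trailing space kept
```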
From 21d790f87ea21fc1f7f9f938621d805b21564493 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 14:48:07 -0700
Subject: [PATCH 46/58] Optimize LaTeX rendering during streaming for long
replies
---
js/main.js | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/js/main.js b/js/main.js
index 9b9d685a..c9ee6284 100644
--- a/js/main.js
+++ b/js/main.js
@@ -260,13 +260,19 @@ function doSyntaxHighlighting() {
codeBlock.classList.add("pretty_scrollbar");
});
- renderMathInElement(messageBody, {
- delimiters: [
- { left: "$$", right: "$$", display: true },
- { left: "$", right: "$", display: false },
- { left: "\\(", right: "\\)", display: false },
- { left: "\\[", right: "\\]", display: true },
- ],
+ // Only render math in visible elements
+ const mathContainers = messageBody.querySelectorAll("p, div, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
+ mathContainers.forEach(container => {
+ if (isElementVisibleOnScreen(container)) {
+ renderMathInElement(container, {
+ delimiters: [
+ { left: "$$", right: "$$", display: true },
+ { left: "$", right: "$", display: false },
+ { left: "\\(", right: "\\)", display: false },
+ { left: "\\[", right: "\\]", display: true },
+ ],
+ });
+ }
});
} else if (hasSeenVisible) {
// We've seen visible messages but this one is not visible
From 5920ad8834dd8d3077376636c8347e7547bb6a04 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 15:22:50 -0700
Subject: [PATCH 47/58] UI: Give streaming instruct messages more vertical
space
---
js/main.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/js/main.js b/js/main.js
index c9ee6284..7246ca87 100644
--- a/js/main.js
+++ b/js/main.js
@@ -206,7 +206,7 @@ const observer = new MutationObserver(function(mutations) {
// Add padding to the messages container to create room for the last message.
// The purpose of this is to avoid constant scrolling during streaming in
// instruct mode.
- const bufferHeight = Math.max(0, Math.max(0.7 * window.innerHeight, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
+ const bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
messagesContainer.style.paddingBottom = `${bufferHeight}px`;
}
}
From 5631d4e3d69a7d3e77899efcd71b8e4860cfa346 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 15:34:49 -0700
Subject: [PATCH 48/58] Minor change after
21d790f87ea21fc1f7f9f938621d805b21564493
---
js/main.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/js/main.js b/js/main.js
index 7246ca87..c08dffcf 100644
--- a/js/main.js
+++ b/js/main.js
@@ -261,7 +261,7 @@ function doSyntaxHighlighting() {
});
// Only render math in visible elements
- const mathContainers = messageBody.querySelectorAll("p, div, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
+ const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
mathContainers.forEach(container => {
if (isElementVisibleOnScreen(container)) {
renderMathInElement(container, {
From 7b80e9a2ad0cb12ac732bd2bcf7bc0bd1cb3a0e6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 20:22:11 -0700
Subject: [PATCH 49/58] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
16 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 77ddc8fb..b9b7dcbe 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -35,8 +35,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 802f6724..fb0ee8f8 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index bbb4fa59..080615e3 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b721bcce..6b8181a7 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 80b168d2..5f44da75 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 5bfcdea6..aa82f50c 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -34,5 +34,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 31743a21..452ad801 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -34,5 +34,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 0d04d229..2bd992fc 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -35,8 +35,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index ca0c4017..c170d1e2 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index b5a853ba..2d71b660 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 995f5f26..3a9b79e1 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -19,6 +19,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 3b5d9442..202b726d 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 4bc705ec..3c1a14bd 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index a4dc4de9..6ba0abcc 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 4367e180..cd94fb8c 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 2130efcc..51727f2d 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From 13876a1ee8b0d40ab54f4683b02c4534543f8aa8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 20:27:32 -0700
Subject: [PATCH 50/58] llama.cpp: Remove the --flash-attn flag (it's always on
now)
---
modules/llama_cpp_server.py | 2 --
modules/loaders.py | 1 -
modules/shared.py | 4 ----
modules/ui.py | 1 -
modules/ui_model_menu.py | 1 -
5 files changed, 9 deletions(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 8579f843..6a094c9d 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -318,8 +318,6 @@ class LlamaServer:
"--no-webui",
]
- if shared.args.flash_attn:
- cmd.append("--flash-attn")
if shared.args.threads > 0:
cmd += ["--threads", str(shared.args.threads)]
if shared.args.threads_batch > 0:
diff --git a/modules/loaders.py b/modules/loaders.py
index f88e976d..fe982ab5 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -16,7 +16,6 @@ loaders_and_params = OrderedDict({
'streaming_llm',
'rope_freq_base',
'compress_pos_emb',
- 'flash_attn',
'row_split',
'no_kv_offload',
'no_mmap',
diff --git a/modules/shared.py b/modules/shared.py
index a3085239..4daf43c9 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -73,7 +73,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
# llama.cpp
group = parser.add_argument_group('llama.cpp')
-group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
@@ -159,9 +158,6 @@ group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 f
group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API')
group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')
-# Deprecated parameters
-group = parser.add_argument_group('Deprecated')
-
# Handle CMD_FLAGS.txt
cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt"
if cmd_flags_path.exists():
diff --git a/modules/ui.py b/modules/ui.py
index 502005e7..12f43768 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -142,7 +142,6 @@ def list_model_elements():
'num_experts_per_token',
'load_in_8bit',
'load_in_4bit',
- 'flash_attn',
'attn_implementation',
'cpu',
'disk',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index dd240627..729700d4 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -50,7 +50,6 @@ def create_ui():
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
- shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
From 8028d8854122887616a5a5322704904fffa98a93 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 21:29:20 -0700
Subject: [PATCH 51/58] Lint
---
modules/models.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index 133131d7..d2b9cc98 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -1,6 +1,5 @@
import sys
import time
-from pathlib import Path
import modules.shared as shared
from modules.logging_colors import logger
From 387e249decfb8ca7e119e5971da11d3605e7e3e3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 21:31:27 -0700
Subject: [PATCH 52/58] Change an info message
---
modules/ui_chat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 1e8218a9..7c388607 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -92,7 +92,7 @@ def create_ui():
gr.HTML("")
with gr.Row():
- shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
+ shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='In instruct and chat-instruct modes, the template under Parameters > Instruction template is used.', elem_id='chat-mode')
with gr.Row():
shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
From 00ebb295d32cc89da239f62b68a15bb1ae4bc636 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 31 Aug 2025 16:27:23 -0700
Subject: [PATCH 53/58] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
16 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index b9b7dcbe..9cf069c7 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -35,8 +35,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index fb0ee8f8..81434cc8 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 080615e3..bf547be7 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 6b8181a7..64dfcab7 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 5f44da75..cb29ad4e 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index aa82f50c..e0a10782 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -34,5 +34,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 452ad801..5b6a4bf4 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -34,5 +34,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 2bd992fc..39c6f768 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -35,8 +35,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index c170d1e2..8407fa29 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 2d71b660..3d2be6de 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 3a9b79e1..2bf635b3 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -19,6 +19,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 202b726d..3b9fc16f 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 3c1a14bd..e4d2900d 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 6ba0abcc..5b492b42 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index cd94fb8c..90e7d38b 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 51727f2d..fe21a1c7 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
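For reference, every wheel line in these requirements files is gated by a PEP 508 environment marker (platform_system, platform_machine, python_version), so pip only downloads the build that matches the running machine. A minimal sketch of how one of the markers above evaluates, assuming the `packaging` library is installed:

    from packaging.markers import Marker

    # Marker copied from one of the CUDA wheel lines above
    marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')

    # Evaluate against the interpreter/platform this script runs on...
    print(marker.evaluate())

    # ...or against an explicit environment, e.g. what a Windows install of
    # Python 3.11 would see (values here are illustrative, not measured).
    print(marker.evaluate({
        "platform_system": "Windows",
        "platform_machine": "AMD64",
        "python_version": "3.11",
    }))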
From d843afcf66afeea23c941d6418d63233d0702d2e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 2 Sep 2025 05:43:33 -0700
Subject: [PATCH 54/58] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
16 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 9cf069c7..3a3b899c 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -35,8 +35,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 81434cc8..388da65c 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index bf547be7..d1635779 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 64dfcab7..dde8d4a1 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -34,7 +34,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index cb29ad4e..9b1776ca 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index e0a10782..17d907bc 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -34,5 +34,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 5b6a4bf4..8c095428 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -34,5 +34,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 39c6f768..553e8cfb 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -35,8 +35,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 8407fa29..e77ce7b1 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 3d2be6de..dc45ef37 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 2bf635b3..541f96d4 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -19,6 +19,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 3b9fc16f..2af3b4b9 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index e4d2900d..6a5f5740 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 5b492b42..a7f2405b 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 90e7d38b..bb2b0f28 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index fe21a1c7..404f1267 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -19,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
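The macOS lines pick a wheel by Darwin kernel release: platform_release 24.x selects the macosx_15_0 build, 23.x selects macosx_14_0, and 22.x selects macosx_13_0 (the latter only exists for arm64 in these files). A small sketch of that selection logic, with the release ranges and tags taken from the requirement lines above:

    import platform

    def pick_macos_wheel_tag(release: str = platform.release()) -> str | None:
        """Map a Darwin kernel release (e.g. '23.5.0') to the wheel's macOS tag."""
        major = int(release.split(".")[0])
        if 24 <= major < 25:
            return "macosx_15_0"
        if 23 <= major < 24:
            return "macosx_14_0"
        if 22 <= major < 23:
            return "macosx_13_0"  # arm64 builds only, per the requirements above
        return None  # no prebuilt wheel listed for this release

    if platform.system() == "Darwin":
        print(pick_macos_wheel_tag())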
From 00ed878b054df15311be871381b09a0a8ecd1135 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 2 Sep 2025 10:16:26 -0700
Subject: [PATCH 55/58] Slightly more robust model loading
---
modules/models.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/modules/models.py b/modules/models.py
index d2b9cc98..9535ea82 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -45,12 +45,13 @@ def load_model(model_name, loader=None):
model, tokenizer = output
else:
model = output
- if model is None:
- return None, None
- else:
+ if model is not None:
from modules.transformers_loader import load_tokenizer
tokenizer = load_tokenizer(model_name)
+ if model is None:
+ return None, None
+
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
shared.settings['truncation_length'] = shared.args.ctx_size
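The change above reorders the fallback so the tokenizer is only fetched when the loader actually returned a model, and the early (None, None) return now covers both branches instead of only the non-tuple one. A standalone sketch of the resulting control flow (the tokenizer lookup and settings handling are stubbed here purely for illustration):

    def load_model_sketch(model_name, loader_output):
        """Illustrative only: mirrors the reordered None-handling, not the real loader."""
        tokenizer = None
        if isinstance(loader_output, tuple):
            model, tokenizer = loader_output
        else:
            model = loader_output
            if model is not None:
                # Tokenizer is loaded only when the loader produced a model.
                tokenizer = f"tokenizer-for-{model_name}"  # stand-in for load_tokenizer()

        # A failed load now exits here regardless of which branch ran.
        if model is None:
            return None, None

        return model, tokenizer

    print(load_model_sketch("some-model", None))         # -> (None, None)
    print(load_model_sketch("some-model", "model-obj"))  # -> ('model-obj', 'tokenizer-for-some-model')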
From c6ea67bbdbb4b1f7de7d6f0a8d6909c54c62c348 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 2 Sep 2025 10:22:03 -0700
Subject: [PATCH 56/58] Lint
---
modules/html_generator.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 9f8c28e5..492b52bd 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -262,7 +262,7 @@ def process_markdown_content(string):
modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
return f'\\({modified_content}\\)'
- return match.group(0) # Fallback
+ return match.group(0) # Fallback
# Make \[ \] LaTeX equations inline
pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$'
From 2395c647d45769fe8c440d75219fb838a74869e3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 2 Sep 2025 12:11:15 -0700
Subject: [PATCH 57/58] Fix the instruct message height on mobile
---
js/main.js | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/js/main.js b/js/main.js
index c08dffcf..c31621f6 100644
--- a/js/main.js
+++ b/js/main.js
@@ -206,7 +206,13 @@ const observer = new MutationObserver(function(mutations) {
// Add padding to the messages container to create room for the last message.
// The purpose of this is to avoid constant scrolling during streaming in
// instruct mode.
- const bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
+ let bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
+
+ // Subtract header height when screen width is <= 924px
+ if (window.innerWidth <= 924) {
+ bufferHeight = Math.max(0, bufferHeight - 32);
+ }
+
messagesContainer.style.paddingBottom = `${bufferHeight}px`;
}
}
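The padding computed above keeps the streaming message from pushing the page around; the new branch additionally subtracts the 32 px header height whenever the viewport is at most 924 px wide. A small sketch of the same arithmetic (written in Python for consistency with the other examples; the pixel constants come from the patch, the element heights are made up):

    def buffer_height(window_h, window_w, prev_sibling_h, last_child_h):
        """Mirror of the paddingBottom formula in js/main.js (all values in pixels)."""
        buffer = max(0, max(window_h - 128 - 84, window_h - prev_sibling_h - 84) - last_child_h)
        if window_w <= 924:  # narrow layouts: subtract the 32 px header
            buffer = max(0, buffer - 32)
        return buffer

    # Hypothetical 900x700 viewport, 300 px previous message, 120 px streaming message
    print(buffer_height(window_h=700, window_w=900, prev_sibling_h=300, last_child_h=120))  # 336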
From f3829b268a870c8113dc4146a13e5d9e07fd1aea Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 2 Sep 2025 12:12:17 -0700
Subject: [PATCH 58/58] llama.cpp: Always pass --flash-attn on
---
modules/llama_cpp_server.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 6a094c9d..38589cf2 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -316,6 +316,7 @@ class LlamaServer:
"--batch-size", str(shared.args.batch_size),
"--port", str(self.port),
"--no-webui",
+ "--flash-attn", "on",
]
if shared.args.threads > 0:
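With this change the server is always launched with "--flash-attn on" rather than relying on the binary's default. A minimal sketch of how such a command list comes together, using only flags that appear in the hunk above plus a model-path argument; the function name and values are placeholders, not the real LlamaServer fields:

    def build_server_cmd(server_path, model_path, batch_size, port, threads=0):
        """Assemble a llama-server invocation similar to llama_cpp_server.py."""
        cmd = [
            server_path,
            "--model", model_path,
            "--batch-size", str(batch_size),
            "--port", str(port),
            "--no-webui",
            "--flash-attn", "on",  # now passed unconditionally
        ]
        if threads > 0:
            cmd += ["--threads", str(threads)]
        return cmd

    print(build_server_cmd("llama-server", "model.gguf", batch_size=2048, port=8080))
    # The real module launches the assembled list with subprocess.Popen.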