From fd41f2fafcc6e286b69ba7efe2f5214d89f834ca Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 22 Aug 2025 11:18:56 -0700 Subject: [PATCH 01/58] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 9f906b26..f1d40000 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 70e031b8..437a10d9 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 81556326..0170d951 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 7b9d3650..7b369c40 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
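# The part after each ";" above is a PEP 508 environment marker: pip only installs a
# given wheel when the marker is true for the current interpreter and platform.
# A minimal sketch of evaluating one of these markers with the `packaging` library
# (marker string copied from the CUDA wheel lines; the result depends on the machine):
from packaging.markers import Marker

marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
print(marker.evaluate())  # True only on x86_64 Linux running Python 3.11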
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 0fc9162f..1d1f44e0 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 3565a994..e63e9705 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 64c17416..c03a718a 100644 --- 
a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 2b162308..70b73e83 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 943ea600..ab91a763 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 394b89b6..0faa6502 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index cffe3aea..e1024942 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d274e2c8..3d2b6338 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) 
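# The "+cu124", "+vulkan", "+cpuavx2", etc. suffixes in these wheel names are PEP 440
# local version labels that distinguish build variants of the same release.
# A small illustration with the `packaging` library, using a version string taken
# from the lines above:
from packaging.version import Version

v = Version("0.38.0+cu124")
print(v.public)  # "0.38.0" - the release being pinned
print(v.local)   # "cu124"  - the build-variant label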
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 47ec086e..a95b30b3 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 9a0a3694..2eb7f597 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 45e96da9..3244c9d4 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 9183562e..685c7d1c 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From f247c2ae62fa246414bededf901df058665f819b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 22 Aug 2025 11:46:02 -0700 Subject: [PATCH 02/58] Make --model work with absolute paths, eg --model /tmp/gemma-3-270m-it-IQ4_NL.gguf --- modules/models.py | 12 ++++++++++-- modules/models_settings.py | 16 +++++++++------- modules/utils.py | 13 +++++++++++++ server.py | 13 +++---------- 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/modules/models.py b/modules/models.py index ca3d184f..cae88ac5 100644 --- a/modules/models.py +++ b/modules/models.py @@ -5,6 +5,7 @@ from pathlib import Path import modules.shared as shared from modules.logging_colors import logger from modules.models_settings import get_model_metadata +from modules.utils import resolve_model_path last_generation_time = time.time() @@ -69,17 +70,24 @@ def load_model(model_name, loader=None): def llama_cpp_server_loader(model_name): from modules.llama_cpp_server import LlamaServer - path = Path(f'{shared.args.model_dir}/{model_name}') + path = resolve_model_path(model_name) + if path.is_file(): model_file = path else: - model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] + gguf_files = sorted(path.glob('*.gguf')) + if not gguf_files: + logger.error(f"No .gguf models found in the directory: {path}") + return None, None + + model_file = gguf_files[0] try: model = LlamaServer(model_file) return model, model except Exception as e: logger.error(f"Error loading the model with llama.cpp: {str(e)}") + return None, None def transformers_loader(model_name): diff --git a/modules/models_settings.py b/modules/models_settings.py index c325fa0c..aa16fdb9 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -10,6 +10,7 @@ import yaml from modules import chat, loaders, metadata_gguf, shared, ui from modules.logging_colors import logger +from modules.utils import resolve_model_path def get_fallback_settings(): @@ -26,6 +27,7 @@ def get_fallback_settings(): def get_model_metadata(model): + model_path = resolve_model_path(model) model_settings = {} 
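# This patch routes model lookups through the new resolve_model_path() helper (added
# to modules/utils.py further down), so --model accepts either a bare name under
# --model-dir or a full path, e.g.:
#   python server.py --model /tmp/gemma-3-270m-it-IQ4_NL.gguf
#   python server.py --model some-model-folder
# A rough standalone sketch of that resolution rule; "user_data/models" is only an
# assumed default for the models directory:
from pathlib import Path

def resolve(model_name_or_path: str, model_dir: str = "user_data/models") -> Path:
    candidate = Path(model_name_or_path)
    # Use the argument directly when it already points at an existing file or folder...
    if candidate.exists():
        return candidate
    # ...otherwise fall back to looking it up inside the models directory.
    return Path(model_dir) / model_name_or_path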
# Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml @@ -35,7 +37,7 @@ def get_model_metadata(model): for k in settings[pat]: model_settings[k] = settings[pat][k] - path = Path(f'{shared.args.model_dir}/{model}/config.json') + path = model_path / 'config.json' if path.exists(): hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read()) else: @@ -51,7 +53,7 @@ def get_model_metadata(model): # GGUF metadata if model_settings['loader'] == 'llama.cpp': - path = Path(f'{shared.args.model_dir}/{model}') + path = model_path if path.is_file(): model_file = path else: @@ -130,18 +132,18 @@ def get_model_metadata(model): model_settings['bf16'] = True # Try to find the Jinja instruct template - path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' + path = model_path / 'tokenizer_config.json' template = None # 1. Prioritize reading from chat_template.jinja if it exists - jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja' + jinja_path = model_path / 'chat_template.jinja' if jinja_path.exists(): with open(jinja_path, 'r', encoding='utf-8') as f: template = f.read() # 2. If no .jinja file, try chat_template.json if template is None: - json_template_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.json' + json_template_path = model_path / 'chat_template.json' if json_template_path.exists(): with open(json_template_path, 'r', encoding='utf-8') as f: json_data = json.load(f) @@ -201,7 +203,7 @@ def get_model_metadata(model): def infer_loader(model_name, model_settings, hf_quant_method=None): - path_to_model = Path(f'{shared.args.model_dir}/{model_name}') + path_to_model = resolve_model_path(model_name) if not path_to_model.exists(): loader = None elif shared.args.portable: @@ -357,7 +359,7 @@ def get_model_size_mb(model_file: Path) -> float: def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): - model_file = Path(f'{shared.args.model_dir}/{gguf_file}') + model_file = resolve_model_path(gguf_file) metadata = load_gguf_metadata_with_cache(model_file) size_in_mb = get_model_size_mb(model_file) diff --git a/modules/utils.py b/modules/utils.py index 4927ef04..e8d23a02 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -86,6 +86,19 @@ def check_model_loaded(): return True, None +def resolve_model_path(model_name_or_path): + """ + Resolves a model path, checking for a direct path + before the default models directory. 
+ """ + + path_candidate = Path(model_name_or_path) + if path_candidate.exists(): + return path_candidate + else: + return Path(f'{shared.args.model_dir}/{model_name_or_path}') + + def get_available_models(): # Get all GGUF files gguf_files = get_available_ggufs() diff --git a/server.py b/server.py index 7ce3c208..e6687a3c 100644 --- a/server.py +++ b/server.py @@ -283,21 +283,14 @@ if __name__ == "__main__": # If any model has been selected, load it if shared.model_name != 'None': - p = Path(shared.model_name) - if p.exists(): - model_name = p.parts[-1] - shared.model_name = model_name - else: - model_name = shared.model_name - - model_settings = get_model_metadata(model_name) + model_settings = get_model_metadata(shared.model_name) update_model_parameters(model_settings, initial=True) # hijack the command-line arguments # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings: vram_usage, adjusted_layers = update_gpu_layers_and_vram( shared.args.loader, - model_name, + shared.model_name, model_settings['gpu_layers'], shared.args.ctx_size, shared.args.cache_type, @@ -308,7 +301,7 @@ if __name__ == "__main__": shared.args.gpu_layers = adjusted_layers # Load the model - shared.model, shared.tokenizer = load_model(model_name) + shared.model, shared.tokenizer = load_model(shared.model_name) if shared.args.lora: add_lora_to_model(shared.args.lora) From 7fe8da89448f795838a34adc0b2246855127e4a2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 22 Aug 2025 14:42:56 -0700 Subject: [PATCH 03/58] Minor simplification after f247c2ae62fa246414bededf901df058665f819b --- modules/llama_cpp_server.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 5953803a..8f1924cb 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -20,6 +20,7 @@ from modules.image_utils import ( convert_pil_to_base64 ) from modules.logging_colors import logger +from modules.utils import resolve_model_path llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"} @@ -351,14 +352,12 @@ class LlamaServer: if path.exists(): cmd += ["--mmproj", str(path)] if shared.args.model_draft not in [None, 'None']: - path = Path(shared.args.model_draft) - if not path.exists(): - path = Path(f'{shared.args.model_dir}/{shared.args.model_draft}') + path = resolve_model_path(shared.args.model_draft) if path.is_file(): model_file = path else: - model_file = sorted(Path(f'{shared.args.model_dir}/{shared.args.model_draft}').glob('*.gguf'))[0] + model_file = sorted(path.glob('*.gguf'))[0] cmd += ["--model-draft", model_file] if shared.args.draft_max > 0: From 8be798e15f48bd1f498d2c609ddf2f31cf22524b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 24 Aug 2025 12:19:19 -0700 Subject: [PATCH 04/58] llama.cpp: Fix stderr deadlock while loading some multimodal models --- modules/llama_cpp_server.py | 70 ++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 8f1924cb..e3dd43b4 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -410,8 +410,7 @@ class LlamaServer: self.process = subprocess.Popen( cmd, stderr=subprocess.PIPE, - text=True, - bufsize=1, + bufsize=0, env=env ) @@ -473,34 
+472,55 @@ def filter_stderr_with_progress(process_stderr): last_was_progress = False try: - for raw in iter(process_stderr.readline, ''): - line = raw.rstrip('\r\n') - match = progress_re.search(line) + # Read in binary mode and decode manually + buffer = b"" + while True: + # Read chunks aggressively to prevent buffer overflow + chunk = process_stderr.read(4096) + if not chunk: + break - if match: - progress = float(match.group(1)) + buffer += chunk - # Extract just the part from "prompt processing" onwards - prompt_processing_idx = line.find('prompt processing') - if prompt_processing_idx != -1: - display_line = line[prompt_processing_idx:] - else: - display_line = line # fallback to full line + # Process complete lines + while b'\n' in buffer: + line_bytes, buffer = buffer.split(b'\n', 1) + try: + line = line_bytes.decode('utf-8', errors='replace').strip('\r\n') + if line: # Process non-empty lines + match = progress_re.search(line) - # choose carriage return for in-progress or newline at completion - end_char = '\r' if progress < 1.0 else '\n' - print(display_line, end=end_char, file=sys.stderr, flush=True) - last_was_progress = (progress < 1.0) + if match: + progress = float(match.group(1)) - # skip noise lines - elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line): - # if we were in progress, finish that line first - if last_was_progress: - print(file=sys.stderr) + # Extract just the part from "prompt processing" onwards + prompt_processing_idx = line.find('prompt processing') + if prompt_processing_idx != -1: + display_line = line[prompt_processing_idx:] + else: + display_line = line # fallback to full line - print(line, file=sys.stderr, flush=True) - last_was_progress = False + # choose carriage return for in-progress or newline at completion + end_char = '\r' if progress < 1.0 else '\n' + print(display_line, end=end_char, file=sys.stderr, flush=True) + last_was_progress = (progress < 1.0) + + # skip noise lines + elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line): + # if we were in progress, finish that line first + if last_was_progress: + print(file=sys.stderr) + + print(line, file=sys.stderr, flush=True) + last_was_progress = False + + except Exception: + continue except (ValueError, IOError): - # silently ignore broken output or IO errors pass + finally: + try: + process_stderr.close() + except: + pass From 2478294c06dac4ec749f9d37532e1e258a322ee6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 24 Aug 2025 12:37:41 -0700 Subject: [PATCH 05/58] UI: Preload the instruct and chat fonts --- modules/block_requests.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/block_requests.py b/modules/block_requests.py index dc1ee467..618b4bd6 100644 --- a/modules/block_requests.py +++ b/modules/block_requests.py @@ -38,7 +38,6 @@ def my_get(url, **kwargs): return requests.api.request('get', 'http://127.0.0.1/', **kwargs) -# Kindly provided by our friend WizardLM-30B def my_open(*args, **kwargs): filename = str(args[0]) if filename.endswith(('index.html', 'share.html')): @@ -52,6 +51,10 @@ def my_open(*args, **kwargs): file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1') file_contents = file_contents.replace( '', + '\n ' + '\n ' + '\n ' + '\n ' '\n ' '\n ' '\n ' From 1f77427088a1487fd9afc8d43282aba6d51557b6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 24 
Aug 2025 19:56:22 -0700 Subject: [PATCH 06/58] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index f1d40000..c53e7722 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 437a10d9..776dbc7b 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0170d951..7205a72e 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 7b369c40..f4e00c5b 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git 
a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 1d1f44e0..0b28ad21 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index e63e9705..fcbb48f0 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index c03a718a..1705791e 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 70b73e83..7e951219 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index ab91a763..9eb8b33c 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git 
a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 0faa6502..0a27f61b 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index e1024942..79674e5a 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 3d2b6338..3ebb7d8b 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
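# The macOS wheels above are selected through `platform_release`, which on a Mac is
# the Darwin kernel version rather than the marketing macOS version (Darwin 22.x is
# macOS 13, 23.x is macOS 14, 24.x is macOS 15). A quick sketch to inspect the value
# those markers compare against; output is machine-dependent:
import platform

print(platform.system())   # "Darwin" on macOS
print(platform.release())  # e.g. "24.3.0", which selects the macosx_15_0 wheel above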
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index a95b30b3..81b78fe6 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 2eb7f597..58dc529a 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 3244c9d4..5ad8ede1 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 685c7d1c..0adf6e48 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.38.0/llama_cpp_binaries-0.38.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 3bc48014a5bfdf633b814c23bbb5b42212293b06 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 16:48:21 -0700 Subject: [PATCH 07/58] chat.py code simplifications --- modules/chat.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index ab6b43c0..022ab8c9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -213,13 +213,11 @@ def generate_chat_prompt(user_input, state, **kwargs): if assistant_msg: # Handle GPT-OSS as a special case if '<|channel|>analysis<|message|>' in assistant_msg or '<|channel|>final<|message|>' in assistant_msg: - thinking_content = "" final_content = "" # Extract analysis content if present if '<|channel|>analysis<|message|>' in assistant_msg: - # Split the message by the analysis tag to isolate the content that follows parts = assistant_msg.split('<|channel|>analysis<|message|>', 1) if len(parts) > 1: # The content is everything after the tag @@ -240,7 +238,6 @@ def generate_chat_prompt(user_input, state, **kwargs): # Extract final content if present final_tag_to_find = '<|channel|>final<|message|>' if final_tag_to_find in assistant_msg: - # Split the message by the final tag to isolate the content that follows parts = assistant_msg.split(final_tag_to_find, 1) if len(parts) > 1: # The content is everything after the tag @@ -261,6 +258,7 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, msg_dict) else: + # Default case (used by all other models) messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: @@ -286,18 +284,17 @@ def generate_chat_prompt(user_input, state, **kwargs): else: attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" - if image_refs or attachments_text: - enhanced_user_msg = user_msg - if image_refs: - enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}" - if attachments_text: - enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}" + if image_refs: + enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}" + if attachments_text: + enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}" messages.insert(insert_pos, {"role": 
"user", "content": enhanced_user_msg}) + # Handle the current user input user_input = user_input.strip() - # Check if we have attachments even with empty input + # Check if we have attachments has_attachments = False if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: current_row_idx = len(history) @@ -306,7 +303,7 @@ def generate_chat_prompt(user_input, state, **kwargs): if (user_input or has_attachments) and not impersonate and not _continue: # For the current user input being processed, check if we need to add attachments - if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + if len(history_data.get('metadata', {})) > 0: current_row_idx = len(history) user_key = f"user_{current_row_idx}" @@ -325,12 +322,10 @@ def generate_chat_prompt(user_input, state, **kwargs): else: attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" - if image_refs or attachments_text: - user_input = user_input - if image_refs: - user_input = f"{image_refs}\n\n{user_input}" - if attachments_text: - user_input += f"\n\nATTACHMENTS:\n{attachments_text}" + if image_refs: + user_input = f"{image_refs}\n\n{user_input}" + if attachments_text: + user_input += f"\n\nATTACHMENTS:\n{attachments_text}" messages.append({"role": "user", "content": user_input}) From d08800c359bbc90172294a78f569cf284148d4b4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:03:37 -0700 Subject: [PATCH 08/58] chat.py improvements --- modules/chat.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 022ab8c9..cd82b813 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -336,10 +336,6 @@ def generate_chat_prompt(user_input, state, **kwargs): prompt = renderer(messages=messages) if state['mode'] == 'chat-instruct': - outer_messages = [] - if state['custom_system_message'].strip() != '': - outer_messages.append({"role": "system", "content": state['custom_system_message']}) - command = state['chat-instruct_command'] command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1']) command = command.replace('<|prompt|>', prompt) @@ -353,29 +349,31 @@ def generate_chat_prompt(user_input, state, **kwargs): if not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) + suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] + + outer_messages = [] + if state['custom_system_message'].strip() != '': + outer_messages.append({"role": "system", "content": state['custom_system_message']}) + outer_messages.append({"role": "user", "content": command}) outer_messages.append({"role": "assistant", "content": prefix}) prompt = instruct_renderer(messages=outer_messages) - suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] if len(suffix) > 0: prompt = prompt[:-len(suffix)] else: # Handle GPT-OSS as a special case when continuing + # (otherwise the thinking block gets removed...) 
if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']: last_message_to_continue = messages[-1] prompt = renderer(messages=messages[:-1]) - # Start the assistant turn wrapper assistant_reply_so_far = "<|start|>assistant" - if 'thinking' in last_message_to_continue: assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" - prompt += assistant_reply_so_far - else: prompt = renderer(messages=messages) if _continue: From f919cdf881ee45641d588fc664d2c4fe1cc71c4a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:20:51 -0700 Subject: [PATCH 09/58] chat.py code simplifications --- modules/chat.py | 128 +++++++++++++++--------------------------------- 1 file changed, 40 insertions(+), 88 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index cd82b813..023f5a3e 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -86,36 +86,6 @@ yaml.add_representer(str, str_presenter) yaml.representer.SafeRepresenter.add_representer(str, str_presenter) -def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True): - ''' - Given a Jinja template, reverse-engineers the prefix and the suffix for - an assistant message (if impersonate=False) or an user message - (if impersonate=True) - ''' - - if impersonate: - messages = [ - {"role": "user", "content": "<<|user-message-1|>>"}, - {"role": "user", "content": "<<|user-message-2|>>"}, - ] - else: - messages = [ - {"role": "assistant", "content": "<<|user-message-1|>>"}, - {"role": "assistant", "content": "<<|user-message-2|>>"}, - ] - - prompt = renderer(messages=messages) - - suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] - suffix = prompt.split("<<|user-message-2|>>")[1] - prefix = suffix_plus_prefix[len(suffix):] - - if strip_trailing_spaces: - prefix = prefix.rstrip(' ') - - return prefix, suffix - - def get_thinking_suppression_string(template): """ Determines what string needs to be added to suppress thinking mode @@ -341,26 +311,16 @@ def generate_chat_prompt(user_input, state, **kwargs): command = command.replace('<|prompt|>', prompt) command = replace_character_names(command, state['name1'], state['name2']) - if _continue: - prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] - prefix += messages[-1]["content"] - else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] - if not impersonate: - prefix = apply_extensions('bot_prefix', prefix, state) - - suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] - outer_messages = [] if state['custom_system_message'].strip() != '': outer_messages.append({"role": "system", "content": state['custom_system_message']}) outer_messages.append({"role": "user", "content": command}) - outer_messages.append({"role": "assistant", "content": prefix}) - prompt = instruct_renderer(messages=outer_messages) - if len(suffix) > 0: - prompt = prompt[:-len(suffix)] + prompt = instruct_renderer( + messages=outer_messages, + add_generation_prompt=True + ) else: # Handle GPT-OSS as a special case when continuing # (otherwise the thinking block gets removed...) 
@@ -375,29 +335,10 @@ def generate_chat_prompt(user_input, state, **kwargs): assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" prompt += assistant_reply_so_far else: - prompt = renderer(messages=messages) - if _continue: - suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] - if len(suffix) > 0: - prompt = prompt[:-len(suffix)] - else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] - - # Handle GPT-OSS as a special case when not continuing - if '<|channel|>final<|message|>' in state['instruction_template_str']: - if prefix.endswith("<|channel|>final<|message|>"): - prefix = prefix[:-len("<|channel|>final<|message|>")] - - if impersonate: - prefix += "<|message|>" - - if state['mode'] == 'chat' and not impersonate: - prefix = apply_extensions('bot_prefix', prefix, state) - - prompt += prefix - - if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])): - prompt += get_thinking_suppression_string(instruction_template) + prompt = renderer( + messages=messages, + add_generation_prompt=True + ) return prompt @@ -523,24 +464,41 @@ def get_stopping_strings(state): renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2']) renderers.append(renderer) - for renderer in renderers: - prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False) - prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True) + fake_messages = [ + {"role": "user", "content": "first user message"}, + {"role": "assistant", "content": "first assistant message"}, + {"role": "user", "content": "second user message"}, + {"role": "assistant", "content": "second assistant message"}, + ] - stopping_strings += [ - suffix_user + prefix_bot, - suffix_user + prefix_user, - suffix_bot + prefix_bot, - suffix_bot + prefix_user, + stopping_strings = [] + for renderer in renderers: + prompt = renderer(messages=fake_messages) + + # Find positions of each message content + first_user_end = prompt.find("first user message") + len("first user message") + first_assistant_start = prompt.find("first assistant message") + first_assistant_end = prompt.find("first assistant message") + len("first assistant message") + second_user_start = prompt.find("second user message") + second_assistant_end = prompt.find("second assistant message") + len("second assistant message") + + # Extract pieces of text potentially containing unique stopping strings + texts = [ + prompt[first_user_end:first_assistant_start], + prompt[first_assistant_end:second_user_start], + prompt[second_assistant_end:] ] - # Try to find the EOT token - for item in stopping_strings.copy(): - item = item.strip() - if item.startswith("<") and ">" in item: - stopping_strings.append(item.split(">")[0] + ">") - elif item.startswith("[") and "]" in item: - stopping_strings.append(item.split("]")[0] + "]") + for text in texts: + text = text.strip() + if text.startswith("<") and ">" in text: + stopping_strings.append(text.split(">")[0] + ">") + elif text.startswith("[") and "]" in text: + stopping_strings.append(text.split("]")[0] + "]") + elif text.startswith("(") and ")" in text: + stopping_strings.append(text.split(")")[0] + ")") + elif text.startswith("{") and "}" in text: + stopping_strings.append(text.split("}")[0] + "}") if 'stopping_strings' in state and isinstance(state['stopping_strings'], list): stopping_strings += 
state.pop('stopping_strings') @@ -549,12 +507,6 @@ def get_stopping_strings(state): result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)] result = list(set(result)) - # Handle GPT-OSS as a special case - if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result: - result.remove("<|end|>") - result.append("<|result|>") - result = list(set(result)) - if shared.args.verbose: logger.info("STOPPING_STRINGS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result) From aad0104c1b536d62c19e59f4afc5a90c703f169f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:33:13 -0700 Subject: [PATCH 10/58] Remove a function --- modules/chat.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 023f5a3e..05ed02bf 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -86,44 +86,6 @@ yaml.add_representer(str, str_presenter) yaml.representer.SafeRepresenter.add_representer(str, str_presenter) -def get_thinking_suppression_string(template): - """ - Determines what string needs to be added to suppress thinking mode - by comparing template renderings with thinking enabled vs disabled. - """ - - # Render with thinking enabled - with_thinking = template.render( - messages=[{'role': 'user', 'content': ''}], - builtin_tools=None, - tools=None, - tools_in_user_message=False, - add_generation_prompt=True, - enable_thinking=True - ) - - # Render with thinking disabled - without_thinking = template.render( - messages=[{'role': 'user', 'content': ''}], - builtin_tools=None, - tools=None, - tools_in_user_message=False, - add_generation_prompt=True, - enable_thinking=False - ) - - # Find the difference (what gets added to suppress thinking) - i = 0 - while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]: - i += 1 - - j = 0 - while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]: - j += 1 - - return without_thinking[i:len(without_thinking) - j if j else None] - - def generate_chat_prompt(user_input, state, **kwargs): impersonate = kwargs.get('impersonate', False) _continue = kwargs.get('_continue', False) From adeca8a65888f97b94dfdaff6b2492c031ec1ccd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:36:01 -0700 Subject: [PATCH 11/58] Remove changes to the jinja2 templates --- modules/models_settings.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index aa16fdb9..7645880f 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -94,8 +94,6 @@ def get_model_metadata(model): template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'raise_exception\([^)]*\)', "''", template) - template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) - template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template @@ -172,8 +170,6 @@ def get_model_metadata(model): template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = 
re.sub(r'raise_exception\([^)]*\)', "''", template) - template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) - template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template From 3ad59703748dcd5685dcbb7368df45914661e8da Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:43:21 -0700 Subject: [PATCH 12/58] Make the llama.cpp --verbose output less verbose --- modules/llama_cpp_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index e3dd43b4..8579f843 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -193,7 +193,7 @@ class LlamaServer: if shared.args.verbose: logger.info("GENERATE_PARAMS=") - printable_payload = {k: (v if k != "prompt" else "[multimodal object]" if pil_images else v) for k, v in payload.items()} + printable_payload = {k: v for k, v in payload.items() if k != "prompt"} pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() From b330ec35174f6b1b7e26922bdeef16069441bd8a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:54:15 -0700 Subject: [PATCH 13/58] Simplifications --- modules/chat.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 05ed02bf..530e3a0a 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -108,6 +108,7 @@ def generate_chat_prompt(user_input, state, **kwargs): tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, add_generation_prompt=False, + enable_thinking=state['enable_thinking'], reasoning_effort=state['reasoning_effort'] ) @@ -262,10 +263,10 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "user", "content": user_input}) def make_prompt(messages): - if state['mode'] == 'chat-instruct' and _continue: - prompt = renderer(messages=messages[:-1]) - else: - prompt = renderer(messages=messages) + prompt = renderer( + messages=messages[:-1] if _continue else messages, + add_generation_prompt=(state['mode'] != 'chat-instruct') + ) if state['mode'] == 'chat-instruct': command = state['chat-instruct_command'] @@ -287,20 +288,15 @@ def generate_chat_prompt(user_input, state, **kwargs): # Handle GPT-OSS as a special case when continuing # (otherwise the thinking block gets removed...) 
if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']: - last_message_to_continue = messages[-1] - prompt = renderer(messages=messages[:-1]) - assistant_reply_so_far = "<|start|>assistant" - if 'thinking' in last_message_to_continue: - assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" + if 'thinking' in messages[-1]: + assistant_reply_so_far += f"<|channel|>analysis<|message|>{messages[-1]['thinking']}<|end|>" - assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" + assistant_reply_so_far += f"<|channel|>final<|message|>" prompt += assistant_reply_so_far - else: - prompt = renderer( - messages=messages, - add_generation_prompt=True - ) + + if _continue: + prompt += messages[-1].get('content', '') return prompt From c1aa4590ea3d69ba9ae8edd3bf222af27a3cd13b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 18:05:40 -0700 Subject: [PATCH 14/58] Code simplifications, fix impersonate --- modules/chat.py | 60 ++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 530e3a0a..7c2ab4a3 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -228,44 +228,48 @@ def generate_chat_prompt(user_input, state, **kwargs): user_input = user_input.strip() # Check if we have attachments - has_attachments = False - if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: - current_row_idx = len(history) - user_key = f"user_{current_row_idx}" - has_attachments = user_key in metadata and "attachments" in metadata[user_key] - - if (user_input or has_attachments) and not impersonate and not _continue: - # For the current user input being processed, check if we need to add attachments + if not (impersonate or _continue): + has_attachments = False if len(history_data.get('metadata', {})) > 0: current_row_idx = len(history) user_key = f"user_{current_row_idx}" + has_attachments = user_key in metadata and "attachments" in metadata[user_key] - if user_key in metadata and "attachments" in metadata[user_key]: - attachments_text = "" - image_refs = "" + if user_input or has_attachments: + # For the current user input being processed, check if we need to add attachments + if len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" - for attachment in metadata[user_key]["attachments"]: - if attachment.get("type") == "image": - image_refs += "<__media__>" - else: - filename = attachment.get("name", "file") - content = attachment.get("content", "") - if attachment.get("type") == "text/html" and attachment.get("url"): - attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + image_refs = "" + + for attachment in metadata[user_key]["attachments"]: + if attachment.get("type") == "image": + image_refs += "<__media__>" else: - attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + filename = attachment.get("name", "file") + content = attachment.get("content", "") + if attachment.get("type") == "text/html" and attachment.get("url"): + attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" + else: + attachments_text += f"\nName: 
{filename}\nContents:\n\n=====\n{content}\n=====\n\n" - if image_refs: - user_input = f"{image_refs}\n\n{user_input}" - if attachments_text: - user_input += f"\n\nATTACHMENTS:\n{attachments_text}" + if image_refs: + user_input = f"{image_refs}\n\n{user_input}" + if attachments_text: + user_input += f"\n\nATTACHMENTS:\n{attachments_text}" - messages.append({"role": "user", "content": user_input}) + messages.append({"role": "user", "content": user_input}) + + if impersonate: + messages.append({"role": "user", "content": "fake user message replace me"}) def make_prompt(messages): prompt = renderer( messages=messages[:-1] if _continue else messages, - add_generation_prompt=(state['mode'] != 'chat-instruct') + add_generation_prompt=(state['mode'] != 'chat-instruct' and not impersonate) ) if state['mode'] == 'chat-instruct': @@ -298,6 +302,10 @@ def generate_chat_prompt(user_input, state, **kwargs): if _continue: prompt += messages[-1].get('content', '') + if impersonate: + prompt = prompt.split("fake user message replace me", 1)[0] + prompt += user_input + return prompt prompt = make_prompt(messages) From ded6c41cf8b5a95441516515f669766407d9692d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 18:16:17 -0700 Subject: [PATCH 15/58] Fix impersonate for chat-instruct --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 7c2ab4a3..d2513e07 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -263,7 +263,7 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "user", "content": user_input}) - if impersonate: + if impersonate and state['mode'] != 'chat-instruct': messages.append({"role": "user", "content": "fake user message replace me"}) def make_prompt(messages): From b657be73814329d9d8d81f1cec49fe7c738dc3ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 18:22:08 -0700 Subject: [PATCH 16/58] Obtain stopping strings in chat mode --- modules/chat.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index d2513e07..8a9a5a1b 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -456,15 +456,17 @@ def get_stopping_strings(state): ] for text in texts: - text = text.strip() - if text.startswith("<") and ">" in text: - stopping_strings.append(text.split(">")[0] + ">") - elif text.startswith("[") and "]" in text: - stopping_strings.append(text.split("]")[0] + "]") - elif text.startswith("(") and ")" in text: - stopping_strings.append(text.split(")")[0] + ")") - elif text.startswith("{") and "}" in text: - stopping_strings.append(text.split("}")[0] + "}") + stripped_text = text.strip() + if stripped_text.startswith("<") and ">" in stripped_text: + stopping_strings.append(stripped_text.split(">")[0] + ">") + elif stripped_text.startswith("[") and "]" in stripped_text: + stopping_strings.append(stripped_text.split("]")[0] + "]") + elif stripped_text.startswith("(") and ")" in stripped_text: + stopping_strings.append(stripped_text.split(")")[0] + ")") + elif stripped_text.startswith("{") and "}" in stripped_text: + stopping_strings.append(stripped_text.split("}")[0] + "}") + elif ":" in text: + stopping_strings.append(text.split(":")[0] + ":") if 'stopping_strings' in state and isinstance(state['stopping_strings'], list): stopping_strings += state.pop('stopping_strings') From 6c165d2e55f41f6e0259e2175b9c7314b28a221f 
Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 18:28:43 -0700 Subject: [PATCH 17/58] Fix the chat template --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index 644261a0..c3d96b70 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -287,7 +287,7 @@ settings = { 'greeting': 'How can I help you today?', 'custom_system_message': '', 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", - 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}", + 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- name2 + ': ' -}}\n{%- endif %}", # Extensions 'default_extensions': [], From a531328f7eef0d7e6f4ac85186409ee2320586ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 18:41:58 -0700 Subject: [PATCH 18/58] Fix the GPT-OSS stopping string --- modules/chat.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index 8a9a5a1b..96d36ba5 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -475,6 +475,12 @@ def get_stopping_strings(state): result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)] result = list(set(result)) + # Handle GPT-OSS as a special case + if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result: + result.remove("<|end|>") + result.append("<|result|>") + result = list(set(result)) + if shared.args.verbose: logger.info("STOPPING_STRINGS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result) From 8f660aefe361d396847ebce03d86f0e501561c17 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 25 Aug 2025 18:50:16 -0700 Subject: [PATCH 19/58] Fix chat-instruct replies leaking the bot 
name sometimes
---
 modules/chat.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index 96d36ba5..a24a5be1 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -825,6 +825,12 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
         # Extract the reply
         if state['mode'] in ['chat', 'chat-instruct']:
+            reply = reply.lstrip()
+            if reply.startswith(state['name2'] + ':'):
+                reply = reply[len(state['name2'] + ':'):]
+            elif reply.startswith(state['name1'] + ':'):
+                reply = reply[len(state['name1'] + ':'):]
+
             visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
         else:
             visible_reply = reply

From 8fcb4b310242a37b58bd006c28b7bb29a688e767 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 19:10:46 -0700
Subject: [PATCH 20/58] Make bot_prefix extensions functional again

---
 modules/chat.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index a24a5be1..5eb9f301 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -306,6 +306,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
             prompt = prompt.split("fake user message replace me", 1)[0]
             prompt += user_input
 
+        if state['mode'] in ['chat', 'chat-instruct'] and not impersonate and not _continue:
+            prompt += apply_extensions('bot_prefix', "", state)
+
         return prompt
 
     prompt = make_prompt(messages)

From 6a7166fffaa4361a923cf9d6a15a2f6b96a8be6d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 19:46:48 -0700
Subject: [PATCH 21/58] Add support for the Seed-OSS template

---
 modules/chat.py           | 29 ++++++++++++++++++++++++++++-
 modules/html_generator.py | 28 ++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 5eb9f301..818d1014 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -109,7 +109,8 @@ def generate_chat_prompt(user_input, state, **kwargs):
         tools_in_user_message=False,
         add_generation_prompt=False,
         enable_thinking=state['enable_thinking'],
-        reasoning_effort=state['reasoning_effort']
+        reasoning_effort=state['reasoning_effort'],
+        thinking_budget=-1 if state.get('enable_thinking', True) else 0
     )
 
     chat_renderer = partial(
@@ -190,6 +191,30 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
                 messages.insert(insert_pos, msg_dict)
 
+            # Handle Seed-OSS
+            elif '<seed:think>' in assistant_msg:
+                thinking_content = ""
+                final_content = assistant_msg
+
+                # Extract thinking content if present
+                if '<seed:think>' in assistant_msg:
+                    parts = assistant_msg.split('<seed:think>', 1)
+                    if len(parts) > 1:
+                        potential_content = parts[1]
+                        if '</seed:think>' in potential_content:
+                            thinking_content = potential_content.split('</seed:think>', 1)[0].strip()
+                            final_content = parts[0] + potential_content.split('</seed:think>', 1)[1]
+                        else:
+                            thinking_content = potential_content.strip()
+                            final_content = parts[0]
+
+                # Insert as structured message
+                msg_dict = {"role": "assistant", "content": final_content.strip()}
+                if thinking_content:
+                    msg_dict["reasoning_content"] = thinking_content
+
+                messages.insert(insert_pos, msg_dict)
+
             else:
                 # Default case (used by all other models)
                 messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
@@ -687,6 +712,8 @@ def generate_search_query(user_message, state):
         query = query.rsplit("</think>", 1)[1]
     elif "<|start|>assistant<|channel|>final<|message|>" in query:
         query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
+    elif "</seed:think>" in query:
+        query = query.rsplit("</seed:think>", 1)[1]
 
     # Strip and remove surrounding quotes if present
     query = query.strip()
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 279f9ba6..63844f35 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -137,7 +137,7 @@ def extract_thinking_block(string):
         remaining_content = string[content_start:]
         return thinking_content, remaining_content
 
-    # If think tags not found, try alternative format
+    # If think tags not found, try GPT-OSS alternative format
     ALT_START = "<|channel|>analysis<|message|>"
     ALT_END = "<|end|>"
     ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>"
@@ -168,7 +168,31 @@ def extract_thinking_block(string):
         remaining_content = string[content_start:]
         return thinking_content, remaining_content
 
-    # Return if neither format is found
+    # Try seed:think format
+    SEED_START = "<seed:think>"
+    SEED_END = "</seed:think>"
+
+    seed_start_pos = string.find(SEED_START)
+    seed_end_pos = string.find(SEED_END)
+
+    if seed_start_pos != -1 or seed_end_pos != -1:
+        if seed_start_pos == -1:
+            thought_start = 0
+        else:
+            thought_start = seed_start_pos + len(SEED_START)
+
+        if seed_end_pos == -1:
+            thought_end = len(string)
+            content_start = len(string)
+        else:
+            thought_end = seed_end_pos
+            content_start = seed_end_pos + len(SEED_END)
+
+        thinking_content = string[thought_start:thought_end]
+        remaining_content = string[content_start:]
+        return thinking_content, remaining_content
+
+    # Return if no format is found
     return None, string

From 02ca96fa44e0a29eb52aad46fafd0995e1d91d42 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 25 Aug 2025 22:17:22 -0700
Subject: [PATCH 22/58] Multiple fixes

---
 modules/chat.py   | 38 +++++++++++++++++++++++---------------
 modules/shared.py |  4 ++--
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 818d1014..3c61a0dd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -292,9 +292,22 @@ def generate_chat_prompt(user_input, state, **kwargs):
         messages.append({"role": "user", "content": "fake user message replace me"})
 
     def make_prompt(messages):
+        last_message = messages[-1].copy()
+        if _continue:
+            if state['mode'] == 'chat-instruct':
+                messages = messages[:-1]
+            else:
+                messages[-1]["content"] = "fake assistant message replace me"
+                messages.append({"role": "assistant", "content": "this will get deleted"})
+
+        if state['mode'] != 'chat-instruct':
+            add_generation_prompt = (not _continue and not impersonate)
+        else:
+            add_generation_prompt = False
+
         prompt = renderer(
-            messages=messages[:-1] if _continue else messages,
-            add_generation_prompt=(state['mode'] != 'chat-instruct' and not impersonate)
+            messages=messages,
+            add_generation_prompt=add_generation_prompt
         )
 
         if state['mode'] == 'chat-instruct':
@@ -308,24 +321,19 @@ def generate_chat_prompt(user_input, state, **kwargs):
                 outer_messages.append({"role": "system", "content": state['custom_system_message']})
 
             outer_messages.append({"role": "user", "content": command})
+            if _continue:
+                outer_messages.append(last_message.copy())
+                outer_messages[-1]["content"] = "fake assistant message replace me"
+                outer_messages.append({"role": "assistant", "content": "this will get deleted"})
 
             prompt = instruct_renderer(
                 messages=outer_messages,
-                add_generation_prompt=True
+                add_generation_prompt=not _continue
             )
-        else:
-            # Handle GPT-OSS as a special case when continuing
-            # (otherwise the thinking block gets removed...)
- if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']: - assistant_reply_so_far = "<|start|>assistant" - if 'thinking' in messages[-1]: - assistant_reply_so_far += f"<|channel|>analysis<|message|>{messages[-1]['thinking']}<|end|>" - - assistant_reply_so_far += f"<|channel|>final<|message|>" - prompt += assistant_reply_so_far if _continue: - prompt += messages[-1].get('content', '') + prompt = prompt.split("fake assistant message replace me", 1)[0] + prompt += last_message.get("content", "") if impersonate: prompt = prompt.split("fake user message replace me", 1)[0] @@ -453,7 +461,7 @@ def get_stopping_strings(state): renderer = partial(template.render, add_generation_prompt=False) renderers.append(renderer) - if state['mode'] in ['chat', 'chat-instruct']: + if state['mode'] in ['chat']: template = jinja_env.from_string(state['chat_template_str']) renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2']) renderers.append(renderer) diff --git a/modules/shared.py b/modules/shared.py index c3d96b70..3e72acca 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -203,7 +203,7 @@ settings = { 'start_with': '', 'mode': 'instruct', 'chat_style': 'cai-chat', - 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', + 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>". Reply directly, without starting the reply with the character name.\n\n<|prompt|>', 'enable_web_search': False, 'web_search_pages': 3, 'prompt-notebook': '', @@ -287,7 +287,7 @@ settings = { 'greeting': 'How can I help you today?', 'custom_system_message': '', 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' 
+ '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", - 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- name2 + ': ' -}}\n{%- endif %}", + 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- name2 + ':' -}}\n{%- endif %}", # Extensions 'default_extensions': [], From 750adf793dcf1bc4c5140c84f76b932dc454c194 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 26 Aug 2025 11:58:49 -0700 Subject: [PATCH 23/58] UI: Preserve chat scroll position on textarea resize --- js/main.js | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/js/main.js b/js/main.js index 4b4b14c2..4ada64f6 100644 --- a/js/main.js +++ b/js/main.js @@ -1065,3 +1065,57 @@ document.fonts.addEventListener("loadingdone", (event) => { } }, 50); }); + +//------------------------------------------------ +// Preserve chat scroll position on textarea resize +//------------------------------------------------ +(function() { + let chatParent = null; + let initialState = null; + let debounceTimeout = null; + + function getChatParent() { + if (!chatParent) chatParent = document.querySelector(".chat-parent"); + return chatParent; + } + + function getTextarea() { + return document.querySelector("#chat-input textarea"); + } + + document.addEventListener("input", function(e) { + if (e.target.matches("#chat-input textarea")) { + const chat = getChatParent(); + const textarea = getTextarea(); + + if (chat && textarea) { + // Capture initial state only on first input of a typing sequence + if (!initialState) { + initialState = { + scrollTop: chat.scrollTop, + textareaHeight: textarea.offsetHeight + }; + } + + // Clear existing timeout + clearTimeout(debounceTimeout); + + // Wait for typing to stop (50ms delay) + debounceTimeout = setTimeout(() => { + const finalTextareaHeight = textarea.offsetHeight; + const totalGrowth = finalTextareaHeight - initialState.textareaHeight; + const targetScroll = initialState.scrollTop + totalGrowth; + + const restore = () => { chat.scrollTop = targetScroll; }; + + restore(); + requestAnimationFrame(restore); + setTimeout(restore, 0); + setTimeout(restore, 10); + + initialState = null; + }, 50); + } + } + }, 
true); +})(); From ccc8a2229dd82ae8d77274341785e043f6f3a343 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 26 Aug 2025 13:59:54 -0700 Subject: [PATCH 24/58] Revert "UI: Preserve chat scroll position on textarea resize" This reverts commit 750adf793dcf1bc4c5140c84f76b932dc454c194. --- js/main.js | 54 ------------------------------------------------------ 1 file changed, 54 deletions(-) diff --git a/js/main.js b/js/main.js index 4ada64f6..4b4b14c2 100644 --- a/js/main.js +++ b/js/main.js @@ -1065,57 +1065,3 @@ document.fonts.addEventListener("loadingdone", (event) => { } }, 50); }); - -//------------------------------------------------ -// Preserve chat scroll position on textarea resize -//------------------------------------------------ -(function() { - let chatParent = null; - let initialState = null; - let debounceTimeout = null; - - function getChatParent() { - if (!chatParent) chatParent = document.querySelector(".chat-parent"); - return chatParent; - } - - function getTextarea() { - return document.querySelector("#chat-input textarea"); - } - - document.addEventListener("input", function(e) { - if (e.target.matches("#chat-input textarea")) { - const chat = getChatParent(); - const textarea = getTextarea(); - - if (chat && textarea) { - // Capture initial state only on first input of a typing sequence - if (!initialState) { - initialState = { - scrollTop: chat.scrollTop, - textareaHeight: textarea.offsetHeight - }; - } - - // Clear existing timeout - clearTimeout(debounceTimeout); - - // Wait for typing to stop (50ms delay) - debounceTimeout = setTimeout(() => { - const finalTextareaHeight = textarea.offsetHeight; - const totalGrowth = finalTextareaHeight - initialState.textareaHeight; - const targetScroll = initialState.scrollTop + totalGrowth; - - const restore = () => { chat.scrollTop = targetScroll; }; - - restore(); - requestAnimationFrame(restore); - setTimeout(restore, 0); - setTimeout(restore, 10); - - initialState = null; - }, 50); - } - } - }, true); -})(); From 8042f76399f9aa84c4d16dd10fb9ddf3d01238a4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 Aug 2025 05:37:01 -0700 Subject: [PATCH 25/58] Make portable installs functional with Python 3.13 --- requirements/full/requirements.txt | 1 + requirements/full/requirements_amd.txt | 1 + requirements/full/requirements_amd_noavx2.txt | 1 + requirements/full/requirements_apple_intel.txt | 1 + requirements/full/requirements_apple_silicon.txt | 1 + requirements/full/requirements_cpu_only.txt | 1 + requirements/full/requirements_cpu_only_noavx2.txt | 1 + requirements/full/requirements_noavx2.txt | 1 + requirements/full/requirements_nowheels.txt | 1 + requirements/portable/requirements.txt | 1 + requirements/portable/requirements_apple_intel.txt | 1 + requirements/portable/requirements_apple_silicon.txt | 1 + requirements/portable/requirements_cpu_only.txt | 1 + requirements/portable/requirements_cpu_only_noavx2.txt | 1 + requirements/portable/requirements_noavx2.txt | 1 + requirements/portable/requirements_nowheels.txt | 1 + requirements/portable/requirements_vulkan.txt | 1 + requirements/portable/requirements_vulkan_noavx2.txt | 1 + 18 files changed, 18 insertions(+) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index c53e7722..77ddc8fb 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version 
>= "3.13" bitsandbytes==0.46.* colorama datasets diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 776dbc7b..802f6724 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" colorama datasets einops diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 7205a72e..bbb4fa59 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" colorama datasets einops diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index f4e00c5b..b721bcce 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" colorama datasets einops diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 0b28ad21..80b168d2 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" colorama datasets einops diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index fcbb48f0..5bfcdea6 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" colorama datasets einops diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 1705791e..31743a21 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" colorama datasets einops diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 7e951219..0d04d229 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" bitsandbytes==0.46.* colorama datasets diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index cd85a744..74d86047 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" colorama datasets einops diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 9eb8b33c..ca0c4017 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 0a27f61b..b5a853ba 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 diff --git 
a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 79674e5a..995f5f26 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 3ebb7d8b..3b5d9442 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 81b78fe6..4bc705ec 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 58dc529a..a4dc4de9 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index b7b73eff..be624bb1 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 5ad8ede1..4367e180 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 0adf6e48..2130efcc 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -1,3 +1,4 @@ +audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 gradio==4.37.* html2text==2025.4.15 From 73442a2b6d0f2de333c26cbdde862f3f7b84d8a8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 Aug 2025 05:43:13 -0700 Subject: [PATCH 26/58] UI: Better handle the chat input position with CSS This also solves scrolling issues with the main chat content when the height of the textarea increases. 
--- css/chat_style-messenger.css | 2 ++ css/main.css | 4 ++++ js/main.js | 27 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index 583703c0..70fd6d4a 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -99,9 +99,11 @@ .message-body p em { color: rgb(110 110 110) !important; } + .editing-textarea { width: max(30rem) !important; } + .circle-you + .text .edit-control-button, .circle-you + .text .editing-textarea { color: #000 !important; } diff --git a/css/main.css b/css/main.css index 062d3eb2..b799f595 100644 --- a/css/main.css +++ b/css/main.css @@ -404,6 +404,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { flex: 1; overflow: auto !important; border-radius: 0 !important; + margin-bottom: 75px; } .chat-parent .prose { @@ -626,6 +627,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { max-width: 54rem; left: 50%; transform: translateX(-50%); + position: absolute; + bottom: 0; + background: var(--body-background-fill); } @media print { diff --git a/js/main.js b/js/main.js index 4b4b14c2..9b9d685a 100644 --- a/js/main.js +++ b/js/main.js @@ -1065,3 +1065,30 @@ document.fonts.addEventListener("loadingdone", (event) => { } }, 50); }); + +(function() { + const chatParent = document.querySelector(".chat-parent"); + const chatInputRow = document.querySelector("#chat-input-row"); + const originalMarginBottom = 75; + let originalHeight = chatInputRow.offsetHeight; + + function updateMargin() { + const currentHeight = chatInputRow.offsetHeight; + const heightDifference = currentHeight - originalHeight; + chatParent.style.marginBottom = `${originalMarginBottom + heightDifference}px`; + } + + // Watch for changes that might affect height + const observer = new MutationObserver(updateMargin); + observer.observe(chatInputRow, { + childList: true, + subtree: true, + attributes: true + }); + + // Also listen for window resize + window.addEventListener("resize", updateMargin); + + // Initial call to set the margin based on current state + updateMargin(); +})(); From 0b4518e61cfe7993017c54d09328dd364301128f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 Aug 2025 05:53:09 -0700 Subject: [PATCH 27/58] "Text generation web UI" -> "Text Generation Web UI" --- README.md | 4 ++-- modules/shared.py | 2 +- server.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6b49cee0..d42697dd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Text generation web UI +# Text Generation Web UI A Gradio web UI for Large Language Models. 
@@ -238,7 +238,7 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [- [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] -Text generation web UI +Text Generation Web UI options: -h, --help show this help message and exit diff --git a/modules/shared.py b/modules/shared.py index 3e72acca..a3085239 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -31,7 +31,7 @@ persistent_interface_state = {} need_restart = False # Parser copied from https://github.com/vladmandic/automatic -parser = argparse.ArgumentParser(description="Text generation web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200)) +parser = argparse.ArgumentParser(description="Text Generation Web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200)) # Basic settings group = parser.add_argument_group('Basic settings') diff --git a/server.py b/server.py index e6687a3c..52463a3c 100644 --- a/server.py +++ b/server.py @@ -70,7 +70,7 @@ from modules.utils import gradio def signal_handler(sig, frame): - logger.info("Received Ctrl+C. Shutting down Text generation web UI gracefully.") + logger.info("Received Ctrl+C. Shutting down Text Generation Web UI gracefully.") # Explicitly stop LlamaServer to avoid __del__ cleanup issues during shutdown if shared.model and shared.model.__class__.__name__ == 'LlamaServer': @@ -87,7 +87,7 @@ signal.signal(signal.SIGINT, signal_handler) def create_interface(): - title = 'Text generation web UI' + title = 'Text Generation Web UI' # Password authentication auth = [] @@ -230,7 +230,7 @@ def create_interface(): if __name__ == "__main__": - logger.info("Starting Text generation web UI") + logger.info("Starting Text Generation Web UI") do_cmd_flags_warnings() # Load custom settings From 030ba7bfeb0e7aed7c8a176e13cc64cd75489d23 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 Aug 2025 07:44:35 -0700 Subject: [PATCH 28/58] UI: Mention that Seed-OSS uses enable_thinking --- modules/ui_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 94c980bb..1dbac13b 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -81,7 +81,7 @@ def create_ui(): gr.HTML("
") shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS, pre-2507 Qwen3.') gr.HTML("
") From a92758a1444626167468f0b0552a642b1e9245a2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:15:20 -0700 Subject: [PATCH 29/58] llama.cpp: Fix obtaining the maximum sequence length for GPT-OSS --- modules/models_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 7645880f..6dc000b4 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -68,7 +68,7 @@ def get_model_metadata(model): metadata = load_gguf_metadata_with_cache(model_file) for k in metadata: - if k.endswith('context_length'): + if k.endswith('.context_length'): model_settings['ctx_size'] = min(metadata[k], 8192) model_settings['truncation_length_info'] = metadata[k] elif k.endswith('rope.freq_base'): From ba6041251d200dfffaf6ea46dd492554a254b241 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 Aug 2025 06:20:00 -0700 Subject: [PATCH 30/58] UI: Minor change --- modules/ui_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 1dbac13b..31a7a4fc 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -81,7 +81,7 @@ def create_ui(): gr.HTML("
") shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS, pre-2507 Qwen3.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.') gr.HTML("
") From a336a8bbeb53136c40040be8d7e18e79eec034df Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 Aug 2025 08:26:40 -0700 Subject: [PATCH 31/58] UI: Fix italic and quote color in headings --- css/html_instruct_style.css | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 3e5ebe67..22901c4d 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -13,7 +13,9 @@ line-height: 28px !important; } -.dark .chat .message-body :is(p, li, q, em, h1, h2, h3, h4, h5, h6) { +.dark .chat .message-body :is(p,li,h1,h2,h3,h4,h5,h6), +.dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6) em), +.dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6) q) { color: #d1d5db !important; } From cfc83745ec96ad963282620524f94b08776de5b6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 Aug 2025 08:34:48 -0700 Subject: [PATCH 32/58] UI: Improve right sidebar borders in light mode --- css/main.css | 9 +++++++++ modules/ui_chat.py | 8 ++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/css/main.css b/css/main.css index b799f595..a7ed2534 100644 --- a/css/main.css +++ b/css/main.css @@ -1697,3 +1697,12 @@ button:focus { #chat-input span { display: none; } + +.sidebar-vertical-separator { + margin: 0; + border-bottom: var(--input-border-width) solid var(--input-border-color); +} + +.dark .sidebar-vertical-separator { + border-bottom: 1px solid rgba(255,255,255,0.1); +} diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 31a7a4fc..1e8218a9 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -78,18 +78,18 @@ def create_ui(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) - gr.HTML("
") + gr.HTML("") shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.') - gr.HTML("
") + gr.HTML("") shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search') with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) - gr.HTML("
") + gr.HTML("") with gr.Row(): shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') @@ -100,7 +100,7 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) - gr.HTML("
") + gr.HTML("") with gr.Row(): shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm') From cb8780a4ce617b9a53d0ffb535f6b18b82b5f3bf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 Aug 2025 11:13:19 -0700 Subject: [PATCH 33/58] Safer check for is_multimodal when loading models Avoids unrelated multimodal error when a model fails to load due to lack of memory. --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index cae88ac5..133131d7 100644 --- a/modules/models.py +++ b/modules/models.py @@ -57,7 +57,7 @@ def load_model(model_name, loader=None): shared.settings['truncation_length'] = shared.args.ctx_size shared.is_multimodal = False - if loader.lower() in ('exllamav3', 'llama.cpp'): + if loader.lower() in ('exllamav3', 'llama.cpp') and hasattr(model, 'is_multimodal'): shared.is_multimodal = model.is_multimodal() logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") From d9eec31886246d5501b6502457c917fb46e9e748 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 Aug 2025 17:46:29 -0700 Subject: [PATCH 34/58] UI: Suppress "Attempted to select a non-interactive or hidden tab" warnings --- js/global_scope_js.js | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index ebed1f3d..89b51d67 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -372,3 +372,18 @@ observer.observe(document.documentElement, { subtree: true, attributeFilter: ["style"] }); + +//------------------------------------------------ +// Suppress "Attempted to select a non-interactive or hidden tab" warning +//------------------------------------------------ +(function() { + const originalWarn = console.warn; + + console.warn = function(...args) { + if (args[0] && args[0].includes("Attempted to select a non-interactive or hidden tab")) { + return; + } + + originalWarn.apply(console, args); + }; +})(); From 272095547845a69a725c4564d63f40e50f47e563 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:48:16 -0700 Subject: [PATCH 35/58] Fix a bug after d9eec31886246d5501b6502457c917fb46e9e748 --- js/global_scope_js.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 89b51d67..d8de2b58 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -380,7 +380,7 @@ observer.observe(document.documentElement, { const originalWarn = console.warn; console.warn = function(...args) { - if (args[0] && args[0].includes("Attempted to select a non-interactive or hidden tab")) { + if (args[0] && typeof args[0] === 'string' && args[0].includes("Attempted to select a non-interactive or hidden tab")) { return; } From fc2eb48664bc8e29034904cac43e6c0bc89aa727 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 Aug 2025 20:19:03 -0700 Subject: [PATCH 36/58] Style fixes after 73442a2b6d0f2de333c26cbdde862f3f7b84d8a8 --- css/main.css | 12 ------------ js/show_controls.js | 12 ------------ 2 files changed, 24 deletions(-) diff --git a/css/main.css b/css/main.css index a7ed2534..cde01aa4 100644 --- a/css/main.css +++ b/css/main.css @@ -429,10 +429,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { margin-left: 5px; } -.chat-parent.bigchat { - flex: 1; -} - .chat > .messages { display: flex; 
flex-direction: column; @@ -832,10 +828,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 1rem; } -#chat-input-row.bigchat { - padding-bottom: 1px !important; -} - #chat-col { height: 100dvh; display: flex; @@ -851,10 +843,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } } -#chat-col.bigchat { - padding-bottom: 15px !important; -} - .message-body ol, .message-body ul { margin-top: 0 !important; margin-bottom: 1.25em !important; diff --git a/js/show_controls.js b/js/show_controls.js index f974d412..ff513395 100644 --- a/js/show_controls.js +++ b/js/show_controls.js @@ -20,12 +20,6 @@ function toggle_controls(value) { extensions.style.display = "inherit"; } - // Remove bigchat classes - chatParent.classList.remove("bigchat"); - document.getElementById("chat-input-row").classList.remove("bigchat"); - document.getElementById("chat-col").classList.remove("bigchat"); - document.getElementById("chat-tab").style.paddingBottom = ""; - let gallery_element = document.getElementById("gallery-extension"); if (gallery_element) { gallery_element.style.display = "block"; @@ -47,11 +41,5 @@ function toggle_controls(value) { if (extensions) { extensions.style.display = "none"; } - - // Add bigchat classes - chatParent.classList.add("bigchat"); - document.getElementById("chat-input-row").classList.add("bigchat"); - document.getElementById("chat-col").classList.add("bigchat"); - document.getElementById("chat-tab").style.paddingBottom = "0px"; } } From d78b7d0fad31c6b9bf89a89f748e1b00c27c5946 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 Aug 2025 20:22:07 -0700 Subject: [PATCH 37/58] Lint --- js/global_scope_js.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index d8de2b58..4d8c1121 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -380,7 +380,7 @@ observer.observe(document.documentElement, { const originalWarn = console.warn; console.warn = function(...args) { - if (args[0] && typeof args[0] === 'string' && args[0].includes("Attempted to select a non-interactive or hidden tab")) { + if (args[0] && typeof args[0] === "string" && args[0].includes("Attempted to select a non-interactive or hidden tab")) { return; } From 084675cf75d08ce2d82fe440abbaf52e429eed3a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 29 Aug 2025 09:11:10 -0700 Subject: [PATCH 38/58] UI: Improve thinking blocks in chat-instruct mode --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index cde01aa4..f2793372 100644 --- a/css/main.css +++ b/css/main.css @@ -1354,6 +1354,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { cursor: pointer; user-select: none; font-size: 14px; + line-height: var(--line-sm); color: rgb(0 0 0 / 70%); transition: background-color 0.2s; } From a2b37adb265847c878d71107fca988851090b46f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 29 Aug 2025 09:25:44 -0700 Subject: [PATCH 39/58] UI: Preload the correct fonts for chat mode --- modules/block_requests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/block_requests.py b/modules/block_requests.py index 618b4bd6..911e41d9 100644 --- a/modules/block_requests.py +++ b/modules/block_requests.py @@ -53,8 +53,9 @@ def my_open(*args, **kwargs): '', '\n ' '\n ' - '\n ' - '\n ' + '\n ' + '\n ' + '\n ' '\n ' '\n ' '\n ' From 
07a2e226c165fd5917c778f18c0f0fd4bcef38b7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 29 Aug 2025 14:08:38 -0700 Subject: [PATCH 40/58] UI: Minor font color fixes in instruct mode --- css/html_instruct_style.css | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 22901c4d..6dee0a89 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -14,8 +14,8 @@ } .dark .chat .message-body :is(p,li,h1,h2,h3,h4,h5,h6), -.dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6) em), -.dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6) q) { +.dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6,b,strong) em), +.dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6,b,strong) q) { color: #d1d5db !important; } From 08f90f4b64565424f812e9a0447338c022d883f2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 29 Aug 2025 14:09:04 -0700 Subject: [PATCH 41/58] Lint --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index f2793372..c7ee57da 100644 --- a/css/main.css +++ b/css/main.css @@ -1693,5 +1693,5 @@ button:focus { } .dark .sidebar-vertical-separator { - border-bottom: 1px solid rgba(255,255,255,0.1); + border-bottom: 1px solid rgb(255 255 255 / 10%); } From a3eb67e466f9cc23a6c3607842375e8bf50f2dd0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Aug 2025 08:42:26 -0700 Subject: [PATCH 42/58] Fix the UI failing to launch if the Notebook prompt is too long --- modules/prompts.py | 3 +-- modules/ui_default.py | 3 +-- modules/ui_notebook.py | 3 +-- server.py | 8 ++++++++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/modules/prompts.py b/modules/prompts.py index 79d9b56e..b800af91 100644 --- a/modules/prompts.py +++ b/modules/prompts.py @@ -22,8 +22,7 @@ def load_prompt(fname): if file_path.exists(): with open(file_path, 'r', encoding='utf-8') as f: text = f.read() - if len(text) > 0 and text[-1] == '\n': - text = text[:-1] + text = text.rstrip() return text else: diff --git a/modules/ui_default.py b/modules/ui_default.py index 44af48a3..c0feae19 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -22,8 +22,7 @@ def create_ui(): with gr.Row(): with gr.Column(): with gr.Row(): - initial_text = load_prompt(shared.settings['prompt-notebook']) - shared.gradio['textbox-default'] = gr.Textbox(value=initial_text, lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar']) + shared.gradio['textbox-default'] = gr.Textbox(value="", lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar']) shared.gradio['token-counter-default'] = gr.HTML(value="0", elem_id="default-token-counter") with gr.Row(): diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index 939d81f7..9fab879b 100644 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -30,8 +30,7 @@ def create_ui(): with gr.Column(scale=4): with gr.Tab('Raw'): with gr.Row(): - initial_text = load_prompt(shared.settings['prompt-notebook']) - shared.gradio['textbox-notebook'] = gr.Textbox(label="", value=initial_text, lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar']) + shared.gradio['textbox-notebook'] = gr.Textbox(label="", value="", lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar']) shared.gradio['token-counter-notebook'] 
= gr.HTML(value="0", elem_id="notebook-token-counter") with gr.Tab('Markdown'): diff --git a/server.py b/server.py index 52463a3c..c804c342 100644 --- a/server.py +++ b/server.py @@ -6,6 +6,7 @@ from pathlib import Path from modules import shared from modules.block_requests import OpenMonkeyPatch, RequestBlocker from modules.logging_colors import logger +from modules.prompts import load_prompt # Set up Gradio temp directory path gradio_temp_path = Path('user_data') / 'cache' / 'gradio' @@ -109,6 +110,13 @@ def create_interface(): 'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp' }) + if shared.settings['prompt-notebook']: + prompt = load_prompt(shared.settings['prompt-notebook']) + shared.persistent_interface_state.update({ + 'textbox-default': prompt, + 'textbox-notebook': prompt + }) + # Clear existing cache files for cache_file in ['pfp_character.png', 'pfp_character_thumb.png']: cache_path = Path(f"user_data/cache/{cache_file}") From 96136ea76008cf1fb440050936b6b5bca5bd3d85 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Aug 2025 10:13:32 -0700 Subject: [PATCH 43/58] Fix LaTeX rendering for equations with asterisks --- modules/html_generator.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/modules/html_generator.py b/modules/html_generator.py index 63844f35..9f8c28e5 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -243,6 +243,27 @@ def process_markdown_content(string): if not string: return "" + # Define a unique placeholder for LaTeX asterisks + LATEX_ASTERISK_PLACEHOLDER = "LATEXASTERISKPLACEHOLDER" + + def protect_asterisks_in_latex(match): + """A replacer function for re.sub to protect asterisks in multiple LaTeX formats.""" + # Check which delimiter group was captured + if match.group(1) is not None: # Content from $$...$$ + content = match.group(1) + modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER) + return f'$${modified_content}$$' + elif match.group(2) is not None: # Content from \[...\] + content = match.group(2) + modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER) + return f'\\[{modified_content}\\]' + elif match.group(3) is not None: # Content from \(...\) + content = match.group(3) + modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER) + return f'\\({modified_content}\\)' + + return match.group(0) # Fallback + # Make \[ \] LaTeX equations inline pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$' replacement = r'\\[ \1 \\]' @@ -272,6 +293,10 @@ def process_markdown_content(string): string = string.replace('\\end{equation*}', '$$') string = re.sub(r"(.)```", r"\1\n```", string) + # Protect asterisks within all LaTeX blocks before markdown conversion + latex_pattern = re.compile(r'\$\$(.*?)\$\$|\\\[(.*?)\\\]|\\\((.*?)\\\)', re.DOTALL) + string = latex_pattern.sub(protect_asterisks_in_latex, string) + result = '' is_code = False is_latex = False @@ -330,6 +355,9 @@ def process_markdown_content(string): # Convert to HTML using markdown html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()]) + # Restore the LaTeX asterisks after markdown conversion + html_output = html_output.replace(LATEX_ASTERISK_PLACEHOLDER, '*') + # Remove extra newlines before html_output = re.sub(r'\s*', '', html_output) From cf1aad2a687622358e121497c30508e960267662 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: 
Sat, 30 Aug 2025 12:16:45 -0700
Subject: [PATCH 44/58] Fix "continue" for Seed-OSS partial thinking blocks

---
 modules/chat.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index 3c61a0dd..6d85bc6e 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -333,7 +333,23 @@ def generate_chat_prompt(user_input, state, **kwargs):
     if _continue:
         prompt = prompt.split("fake assistant message replace me", 1)[0]
-        prompt += last_message.get("content", "")
+        content = last_message.get("content", "")
+        thinking = last_message.get("thinking", "")
+        reasoning = last_message.get("reasoning_content", "")
+
+        partial_thought = thinking or reasoning
+        # Handle partial thinking blocks (GPT-OSS and Seed-OSS)
+        if partial_thought and partial_thought.strip():
+            search_string = partial_thought.strip()
+            index = prompt.rfind(search_string)
+            if index != -1:
+                prompt = prompt[:index] + partial_thought
+            else:
+                # Fallback
+                prompt += content
+        else:
+            # All other cases
+            prompt += content
     if impersonate:
         prompt = prompt.split("fake user message replace me", 1)[0]

From 3a3e247f3cb21ec7c13bcfa9f21757e216d5c7ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 12:36:35 -0700
Subject: [PATCH 45/58] Even better way to handle continue for thinking blocks

---
 modules/chat.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 6d85bc6e..ad2f4001 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -333,20 +333,19 @@ def generate_chat_prompt(user_input, state, **kwargs):
     if _continue:
         prompt = prompt.split("fake assistant message replace me", 1)[0]
-        content = last_message.get("content", "")
-        thinking = last_message.get("thinking", "")
-        reasoning = last_message.get("reasoning_content", "")
-        partial_thought = thinking or reasoning
+        content = last_message.get("content", "")
+        partial_thought = last_message.get("thinking", "") or last_message.get("reasoning_content", "")
+        # Handle partial thinking blocks (GPT-OSS and Seed-OSS)
-        if partial_thought and partial_thought.strip():
+        if not content and partial_thought and partial_thought.strip():
             search_string = partial_thought.strip()
             index = prompt.rfind(search_string)
             if index != -1:
                 prompt = prompt[:index] + partial_thought
             else:
-                # Fallback
-                prompt += content
+                # Fallback if search fails: just append the thought
+                prompt += partial_thought
         else:
             # All other cases
             prompt += content

From 21d790f87ea21fc1f7f9f938621d805b21564493 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 30 Aug 2025 14:48:07 -0700
Subject: [PATCH 46/58] Optimize LaTeX rendering during streaming for long replies

---
 js/main.js | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/js/main.js b/js/main.js
index 9b9d685a..c9ee6284 100644
--- a/js/main.js
+++ b/js/main.js
@@ -260,13 +260,19 @@ function doSyntaxHighlighting() {
       codeBlock.classList.add("pretty_scrollbar");
     });
-    renderMathInElement(messageBody, {
-      delimiters: [
-        { left: "$$", right: "$$", display: true },
-        { left: "$", right: "$", display: false },
-        { left: "\\(", right: "\\)", display: false },
-        { left: "\\[", right: "\\]", display: true },
-      ],
+    // Only render math in visible elements
+    const mathContainers = messageBody.querySelectorAll("p, div, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
+
mathContainers.forEach(container => { + if (isElementVisibleOnScreen(container)) { + renderMathInElement(container, { + delimiters: [ + { left: "$$", right: "$$", display: true }, + { left: "$", right: "$", display: false }, + { left: "\\(", right: "\\)", display: false }, + { left: "\\[", right: "\\]", display: true }, + ], + }); + } }); } else if (hasSeenVisible) { // We've seen visible messages but this one is not visible From 5920ad8834dd8d3077376636c8347e7547bb6a04 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Aug 2025 15:22:50 -0700 Subject: [PATCH 47/58] UI: Give streaming instruct messages more vertical space --- js/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index c9ee6284..7246ca87 100644 --- a/js/main.js +++ b/js/main.js @@ -206,7 +206,7 @@ const observer = new MutationObserver(function(mutations) { // Add padding to the messages container to create room for the last message. // The purpose of this is to avoid constant scrolling during streaming in // instruct mode. - const bufferHeight = Math.max(0, Math.max(0.7 * window.innerHeight, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight); + const bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight); messagesContainer.style.paddingBottom = `${bufferHeight}px`; } } From 5631d4e3d69a7d3e77899efcd71b8e4860cfa346 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Aug 2025 15:34:49 -0700 Subject: [PATCH 48/58] Minor change after 21d790f87ea21fc1f7f9f938621d805b21564493 --- js/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index 7246ca87..c08dffcf 100644 --- a/js/main.js +++ b/js/main.js @@ -261,7 +261,7 @@ function doSyntaxHighlighting() { }); // Only render math in visible elements - const mathContainers = messageBody.querySelectorAll("p, div, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt"); + const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt"); mathContainers.forEach(container => { if (isElementVisibleOnScreen(container)) { renderMathInElement(container, { From 7b80e9a2ad0cb12ac732bd2bcf7bc0bd1cb3a0e6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Aug 2025 20:22:11 -0700 Subject: [PATCH 49/58] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 
deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 77ddc8fb..b9b7dcbe 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -35,8 +35,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 802f6724..fb0ee8f8 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index bbb4fa59..080615e3 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index b721bcce..6b8181a7 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 80b168d2..5f44da75 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == 
"3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 5bfcdea6..aa82f50c 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -34,5 +34,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 31743a21..452ad801 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -34,5 +34,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 0d04d229..2bd992fc 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -35,8 +35,8 @@ sse-starlette==1.6.5 
tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index ca0c4017..c170d1e2 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index b5a853ba..2d71b660 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 995f5f26..3a9b79e1 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -19,6 +19,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 3b5d9442..202b726d 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 4bc705ec..3c1a14bd 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index a4dc4de9..6ba0abcc 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 4367e180..cd94fb8c 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 2130efcc..51727f2d 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.39.0/llama_cpp_binaries-0.39.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 13876a1ee8b0d40ab54f4683b02c4534543f8aa8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Aug 2025 20:27:32 -0700 Subject: [PATCH 50/58] llama.cpp: Remove the --flash-attn flag (it's always on now) --- modules/llama_cpp_server.py | 2 -- modules/loaders.py | 1 - modules/shared.py | 4 ---- modules/ui.py | 1 - modules/ui_model_menu.py | 1 - 5 files changed, 9 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 8579f843..6a094c9d 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -318,8 +318,6 @@ class LlamaServer: "--no-webui", ] - if shared.args.flash_attn: - cmd.append("--flash-attn") if shared.args.threads > 0: cmd += ["--threads", str(shared.args.threads)] if shared.args.threads_batch > 0: diff --git a/modules/loaders.py b/modules/loaders.py index f88e976d..fe982ab5 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -16,7 +16,6 @@ loaders_and_params = OrderedDict({ 'streaming_llm', 'rope_freq_base', 'compress_pos_emb', - 'flash_attn', 'row_split', 'no_kv_offload', 'no_mmap', diff --git a/modules/shared.py b/modules/shared.py index a3085239..4daf43c9 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -73,7 +73,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for # llama.cpp group = parser.add_argument_group('llama.cpp') -group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.') group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') @@ -159,9 +158,6 @@ group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 f group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API') group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. 
Useful for launching the API in standalone mode.') -# Deprecated parameters -group = parser.add_argument_group('Deprecated') - # Handle CMD_FLAGS.txt cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt" if cmd_flags_path.exists(): diff --git a/modules/ui.py b/modules/ui.py index 502005e7..12f43768 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -142,7 +142,6 @@ def list_model_elements(): 'num_experts_per_token', 'load_in_8bit', 'load_in_4bit', - 'flash_attn', 'attn_implementation', 'cpu', 'disk', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index dd240627..729700d4 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -50,7 +50,6 @@ def create_ui(): with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) - shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) From 8028d8854122887616a5a5322704904fffa98a93 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Aug 2025 21:29:20 -0700 Subject: [PATCH 51/58] Lint --- modules/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 133131d7..d2b9cc98 100644 --- a/modules/models.py +++ b/modules/models.py @@ -1,6 +1,5 @@ import sys import time -from pathlib import Path import modules.shared as shared from modules.logging_colors import logger From 387e249decfb8ca7e119e5971da11d3605e7e3e3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Aug 2025 21:31:27 -0700 Subject: [PATCH 52/58] Change an info message --- modules/ui_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 1e8218a9..7c388607 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -92,7 +92,7 @@ def create_ui(): gr.HTML("") with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. 
In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') + shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='In instruct and chat-instruct modes, the template under Parameters > Instruction template is used.', elem_id='chat-mode') with gr.Row(): shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') From 00ebb295d32cc89da239f62b68a15bb1ae4bc636 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 Aug 2025 16:27:23 -0700 Subject: [PATCH 53/58] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index b9b7dcbe..9cf069c7 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -35,8 +35,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index fb0ee8f8..81434cc8 100644 --- 
a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 080615e3..bf547be7 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 6b8181a7..64dfcab7 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 5f44da75..cb29ad4e 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index aa82f50c..e0a10782 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -34,5 +34,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 452ad801..5b6a4bf4 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -34,5 +34,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 2bd992fc..39c6f768 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -35,8 +35,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index c170d1e2..8407fa29 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -19,5 +19,5 @@ 
sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 2d71b660..3d2be6de 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 3a9b79e1..2bf635b3 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -19,6 +19,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 202b726d..3b9fc16f 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 3c1a14bd..e4d2900d 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 6ba0abcc..5b492b42 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index cd94fb8c..90e7d38b 100644 --- 
a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 51727f2d..fe21a1c7 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.40.0/llama_cpp_binaries-0.40.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From d843afcf66afeea23c941d6418d63233d0702d2e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 2 Sep 2025 05:43:33 -0700 Subject: [PATCH 54/58] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 9cf069c7..3a3b899c 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -35,8 +35,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 81434cc8..388da65c 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index bf547be7..d1635779 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 64dfcab7..dde8d4a1 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -34,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index cb29ad4e..9b1776ca 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index e0a10782..17d907bc 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -34,5 +34,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 5b6a4bf4..8c095428 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -34,5 +34,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 39c6f768..553e8cfb 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -35,8 +35,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 8407fa29..e77ce7b1 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 3d2be6de..dc45ef37 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt 
b/requirements/portable/requirements_apple_silicon.txt index 2bf635b3..541f96d4 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -19,6 +19,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 3b9fc16f..2af3b4b9 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index e4d2900d..6a5f5740 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and 
platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 5b492b42..a7f2405b 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 90e7d38b..bb2b0f28 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index fe21a1c7..404f1267 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.41.0/llama_cpp_binaries-0.41.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 00ed878b054df15311be871381b09a0a8ecd1135 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 2 Sep 2025 10:16:26 -0700 Subject: [PATCH 55/58] Slightly more robust model loading --- modules/models.py | 7 ++++--- 1 
file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modules/models.py b/modules/models.py
index d2b9cc98..9535ea82 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -45,12 +45,13 @@ def load_model(model_name, loader=None):
         model, tokenizer = output
     else:
         model = output
-        if model is None:
-            return None, None
-        else:
+        if model is not None:
             from modules.transformers_loader import load_tokenizer
             tokenizer = load_tokenizer(model_name)

+    if model is None:
+        return None, None
+
     shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
     if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
         shared.settings['truncation_length'] = shared.args.ctx_size

From c6ea67bbdbb4b1f7de7d6f0a8d6909c54c62c348 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 2 Sep 2025 10:22:03 -0700
Subject: [PATCH 56/58] Lint

---
 modules/html_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 9f8c28e5..492b52bd 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -262,7 +262,7 @@ def process_markdown_content(string):
             modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
             return f'\\({modified_content}\\)'

-        return match.group(0) # Fallback
+        return match.group(0)  # Fallback

     # Make \[ \] LaTeX equations inline
     pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$'

From 2395c647d45769fe8c440d75219fb838a74869e3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 2 Sep 2025 12:11:15 -0700
Subject: [PATCH 57/58] Fix the instruct message height on mobile

---
 js/main.js | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/js/main.js b/js/main.js
index c08dffcf..c31621f6 100644
--- a/js/main.js
+++ b/js/main.js
@@ -206,7 +206,13 @@ const observer = new MutationObserver(function(mutations) {
       // Add padding to the messages container to create room for the last message.
       // The purpose of this is to avoid constant scrolling during streaming in
       // instruct mode.
-      const bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
+      let bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
+
+      // Subtract header height when screen width is <= 924px
+      if (window.innerWidth <= 924) {
+        bufferHeight = Math.max(0, bufferHeight - 32);
+      }
+
       messagesContainer.style.paddingBottom = `${bufferHeight}px`;
     }
   }

From f3829b268a870c8113dc4146a13e5d9e07fd1aea Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 2 Sep 2025 12:12:17 -0700
Subject: [PATCH 58/58] llama.cpp: Always pass --flash-attn on

---
 modules/llama_cpp_server.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 6a094c9d..38589cf2 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -316,6 +316,7 @@ class LlamaServer:
             "--batch-size", str(shared.args.batch_size),
             "--port", str(self.port),
             "--no-webui",
+            "--flash-attn", "on",
         ]

         if shared.args.threads > 0:
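
A note on the recurring requirements bumps in this series: every llama_cpp_binaries line carries a PEP 508 environment marker, and pip installs a wheel only when its marker evaluates to true, so each platform resolves to at most one build (cu124, vulkan, cpuavx, macOS, and so on). A minimal sketch of that selection using the third-party packaging library; the marker strings are copied from the requirements lines above, everything else is illustrative:

from packaging.markers import Marker

# Markers copied from the CUDA wheel lines in requirements/full/requirements.txt.
cuda_windows = Marker('platform_system == "Windows" and python_version == "3.11"')
cuda_linux = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')

# Evaluate against an explicit environment rather than the current machine so
# the result is deterministic: a Windows / Python 3.11 box matches the
# win_amd64 wheel and skips the linux_x86_64 one.
win_py311 = {"platform_system": "Windows", "platform_machine": "AMD64", "python_version": "3.11"}
print(cuda_windows.evaluate(win_py311))  # True  -> this wheel is installed
print(cuda_linux.evaluate(win_py311))    # False -> this wheel is skipped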
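
The "Slightly more robust model loading" patch is easier to see in isolation: the tokenizer is now fetched only when the single-return branch actually produced a model, and the None check is hoisted below both branches, so a failed load is caught regardless of which shape the loader returned. A condensed, self-contained restatement of that flow; the branch condition and the load_tokenizer callable are stand-ins, only the ordering of the checks mirrors the diff:

def resolve_model(output, load_tokenizer):
    tokenizer = None
    if isinstance(output, tuple):      # loader returned (model, tokenizer)
        model, tokenizer = output
    else:                              # loader returned just a model
        model = output
        if model is not None:          # fetch a tokenizer only for a real model
            tokenizer = load_tokenizer()

    # The check now runs after both branches instead of inside the else.
    if model is None:
        return None, None
    return model, tokenizer

# resolve_model(None, lambda: "tok")    -> (None, None)
# resolve_model("model", lambda: "tok") -> ("model", "tok")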
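
The mobile fix in js/main.js reads most clearly as arithmetic: the padding added under the last message is the free vertical space left after the previous message (with the existing 128/84 px offsets), and on screens 924 px wide or narrower a further 32 px is reserved for the header. A Python restatement of that calculation, for illustration only; the constants come from the diff, while the function and argument names do not exist in the repository:

def buffer_height(inner_height, inner_width, prev_sibling_height, last_child_height):
    # Mirrors the bufferHeight expression after this patch.
    height = max(0, max(inner_height - 128 - 84,
                        inner_height - prev_sibling_height - 84) - last_child_height)
    if inner_width <= 924:  # narrow screens: leave room for the header
        height = max(0, height - 32)
    return height

# buffer_height(800, 900, 300, 100) -> max(0, max(588, 416) - 100) - 32 = 456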