Merge pull request #7366 from oobabooga/dev

Merge dev branch
Update llama.cpp
2026-03-03 12:04:28 +01:00 · 2026-01-08 17:54:12 -03:00 · 2026-01-08 11:24:15 -08:00 · 2026-01-07 19:06:23 -08:00 · 2026-01-06 15:27:23 -03:00 · 2026-01-06 15:27:10 -03:00
24 changed files with 109 additions and 104 deletions
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@ -19,12 +19,14 @@
    color: #d1d5db !important;
 }
-.chat .message-body :is(th, td) {
+.chat .message-body :is(th, td),
 .prose hr {
    border-color: #40404096 !important;
 }
-.dark .chat .message-body :is(th, td) {
+.dark .chat .message-body :is(th, td),
-    border-color: #ffffff75 !important;
+.dark .prose hr {
    border-color: rgb(255 255 255 / 30%) !important;
 }
 .chat .message-body :is(p, ul, ol) {
--- a/css/main.css
+++ b/css/main.css
@ -1797,3 +1797,20 @@ button#swap-height-width {
    top: 0;
    left: calc(100% - 174px);
 }
 table {
    border-collapse: collapse;
 }
 table, tr, td, th, thead {
    border: 0;
 }
 td + td,
 th + th { border-left: 1px solid; }
 tr + tr td,
 tr + tr th { border-top: 1px solid; }
 thead + tbody tr:first-child td,
 thead + tbody tr:first-child th { border-top: 1px solid; }
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@ -83,7 +83,11 @@ def get_model_metadata(model):
        if 'tokenizer.chat_template' in metadata:
            template = metadata['tokenizer.chat_template']
-            eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
+            if 'tokenizer.ggml.eos_token_id' in metadata:
                eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
            else:
                eos_token = ""
            if 'tokenizer.ggml.bos_token_id' in metadata:
                bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
            else:
--- a/modules/shared.py
+++ b/modules/shared.py
@ -112,7 +112,7 @@ group.add_argument('--no-cache', action='store_true', help='Set use_cache to Fal
 group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
 group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
-group.add_argument('--attn-implementation', type=str, default='flash_attention_2', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: flash_attention_2, sdpa, eager.')
+group.add_argument('--attn-implementation', type=str, default='sdpa', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: sdpa, eager, flash_attention_2.')
 # bitsandbytes 4-bit
 group = parser.add_argument_group('bitsandbytes 4-bit')
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@ -44,7 +44,7 @@ def create_ui():
                            shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
                            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
                            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
-                            shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['flash_attention_2', 'sdpa', 'eager'], value=shared.args.attn_implementation, info='Attention implementation.')
+                            shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
                            shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
                            shared.gradio['tp_backend'] = gr.Dropdown(label="tp-backend", choices=['native', 'nccl'], value=shared.args.tp_backend, info='The backend for tensor parallelism.')
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@ -1,11 +1,12 @@
 accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
-bitsandbytes==0.48.*
+bitsandbytes==0.49.*
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
-flash-linear-attention==0.4.0
+flash-linear-attention==0.4.*
 html2text==2025.4.15
 huggingface-hub==0.36.0
 jinja2==3.1.6
@ -21,13 +22,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -35,19 +36,16 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.17/exllamav3-0.0.17+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.18/exllamav3-0.0.18+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.17/exllamav3-0.0.17+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.18/exllamav3-0.0.18+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@ -2,6 +2,7 @@ accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
@ -19,13 +20,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -33,16 +34,13 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@ -2,6 +2,7 @@ accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
@ -19,13 +20,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -33,16 +34,13 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@ -2,6 +2,7 @@ accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
@ -19,13 +20,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -33,14 +34,11 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@ -2,6 +2,7 @@ accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
@ -19,13 +20,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -33,14 +34,11 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@ -2,6 +2,7 @@ accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
@ -19,13 +20,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -33,14 +34,11 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@ -2,6 +2,7 @@ accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
@ -19,13 +20,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -33,14 +34,11 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@ -1,11 +1,12 @@
 accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
-bitsandbytes==0.48.*
+bitsandbytes==0.49.*
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
-flash-linear-attention==0.4.0
+flash-linear-attention==0.4.*
 html2text==2025.4.15
 huggingface-hub==0.36.0
 jinja2==3.1.6
@ -21,13 +22,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -35,19 +36,16 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.17/exllamav3-0.0.17+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.18/exllamav3-0.0.18+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.17/exllamav3-0.0.17+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.18/exllamav3-0.0.18+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@ -2,6 +2,7 @@ accelerate==1.8.*
 audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 diffusers==0.36.*
 einops
 fastapi==0.112.4
 html2text==2025.4.15
@ -19,13 +20,13 @@ python-docx==1.1.2
 pyyaml
 requests
 rich
-safetensors==0.6.*
+safetensors==0.7.*
 scipy
 sentencepiece
 tensorboard
-torchao==0.14.*
+torchao==0.15.*
 transformers==4.57.*
-triton-windows==3.5.1.post21; platform_system == "Windows"
+triton-windows==3.5.1.post22; platform_system == "Windows"
 tqdm
 wandb
@ -33,9 +34,6 @@ wandb
 gradio==4.37.*
 https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
 # Diffusers
 diffusers @ git+https://github.com/huggingface/diffusers.git@edf36f5128abf3e6ecf92b5145115514363c58e6
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
--- a/requirements/portable/requirements_amd_noavx2.txt
+++ b/requirements/portable/requirements_amd_noavx2.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.66.0/llama_cpp_binaries-0.66.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.74.0/llama_cpp_binaries-0.74.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
Author	SHA1	Message	Date
oobabooga	910456ba31	Merge pull request #7366 from oobabooga/dev Merge dev branch	2026-01-08 17:54:12 -03:00
oobabooga	d79cdc614c	Update llama.cpp	2026-01-08 11:24:15 -08:00
oobabooga	332fd40653	Update llama.cpp	2026-01-07 19:06:23 -08:00
dependabot[bot]	50a35b483c	Update bitsandbytes requirement in /requirements/full (#7353)	2026-01-06 15:27:23 -03:00
dependabot[bot]	45fbec0320	Update torchao requirement in /requirements/full (#7356)	2026-01-06 15:27:10 -03:00
oobabooga	b0968ed8b4	Update flash-linear-attention	2026-01-06 10:26:43 -08:00
oobabooga	36747cf99c	Lint	2026-01-06 10:24:34 -08:00
oobabooga	2fcbadec67	Merge remote-tracking branch 'refs/remotes/origin/dev' into dev	2026-01-06 10:24:07 -08:00
oobabooga	bb3b7bc197	Update llama.cpp	2026-01-06 10:23:58 -08:00
Sergey 'Jin' Bostandzhyan	6e2c4e9c23	Fix loading models which have their eos token disabled (#7363)	2026-01-06 11:31:10 -03:00
oobabooga	a2ed640aa6	UI: Improved border color for tables + hr	2025-12-21 15:38:48 -03:00
oobabooga	1066fe8c21	UI: Improve table styles (more minimalistic)	2025-12-21 15:32:02 -03:00
oobabooga	9530d3a6d8	UI: Improve hr (horizontal separator) style	2025-12-21 15:30:54 -03:00
oobabooga	a0b5599e9b	Merge pull request #7355 from oobabooga/dev Merge dev branch	2025-12-20 02:18:31 -03:00
oobabooga	09d88f91e8	Update llama.cpp	2025-12-19 21:00:13 -08:00
oobabooga	34804f9354	Merge pull request #7352 from oobabooga/dev Merge dev branch	2025-12-14 22:59:34 -03:00
oobabooga	6e8fb0e7b1	Update llama.cpp	2025-12-14 13:32:14 -08:00
oobabooga	9fe40ff90f	Update exllamav3 to 0.0.18	2025-12-10 05:37:33 -08:00
oobabooga	8e762e04b4	Merge remote-tracking branch 'refs/remotes/origin/dev' into dev	2025-12-09 05:27:43 -08:00
oobabooga	aa16266c38	Update llama.cpp	2025-12-09 03:19:23 -08:00
dependabot[bot]	85269d7fbb	Update safetensors requirement in /requirements/full (#7323)	2025-12-08 17:58:27 -03:00
dependabot[bot]	c4ebab9b29	Bump triton-windows in /requirements/full (#7346)	2025-12-08 17:56:07 -03:00
oobabooga	bb004bacb1	Merge pull request #7345 from oobabooga/dev Merge dev branch	2025-12-08 10:14:49 -03:00
oobabooga	502f59d39b	Update diffusers to 0.36	2025-12-08 05:08:54 -08:00
oobabooga	4d94f66832	Merge pull request #7343 from oobabooga/dev Merge dev branch	2025-12-07 23:49:19 -03:00
oobabooga	e7c8b51fec	Revert "Use flash_attention_2 by default for Transformers models" This reverts commit `85f2df92e9`.	2025-12-07 18:48:41 -08:00