From 6c2bdda0f02f497ee91f8c13afab7de149823ff0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Jul 2025 18:38:45 -0700 Subject: [PATCH 01/23] Transformers loader: replace `use_flash_attention_2`/`use_eager_attention` with a unified `attn_implementation` Closes #7107 --- modules/loaders.py | 3 +-- modules/models_settings.py | 6 ------ modules/shared.py | 3 +-- modules/transformers_loader.py | 7 +------ modules/ui.py | 3 +-- modules/ui_model_menu.py | 3 +-- 6 files changed, 5 insertions(+), 20 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index 6fbd2198..2b2c6b78 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -40,11 +40,10 @@ loaders_and_params = OrderedDict({ 'load_in_8bit', 'load_in_4bit', 'torch_compile', - 'use_flash_attention_2', + 'attn_implementation', 'cpu', 'disk', 'use_double_quant', - 'use_eager_attention', 'bf16', 'trust_remote_code', 'no_use_fast', diff --git a/modules/models_settings.py b/modules/models_settings.py index bea5b4d6..a06e594e 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -15,7 +15,6 @@ from modules.logging_colors import logger def get_fallback_settings(): return { 'bf16': False, - 'use_eager_attention': False, 'ctx_size': 2048, 'rope_freq_base': 0, 'compress_pos_emb': 1, @@ -118,14 +117,9 @@ def get_model_metadata(model): if metadata['rope_scaling']['type'] == 'linear': model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor'] - # For Gemma-2 if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16': model_settings['bf16'] = True - # For Gemma-2 - if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']: - model_settings['use_eager_attention'] = True - # Try to find the Jinja instruct template path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' if path.exists(): diff --git a/modules/shared.py b/modules/shared.py index 5333ec4f..57649114 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -61,8 +61,7 @@ group.add_argument('--no-cache', action='store_true', help='Set use_cache to Fal group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.') group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.') -group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.') -group.add_argument('--use_eager_attention', action='store_true', help='Set attn_implementation= eager while loading the model.') +group.add_argument('--attn-implementation', type=str, default='sdpa', help='Attention implementation. 
Valid options: sdpa, eager, flash_attention_2.') group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.') # bitsandbytes 4-bit diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py index 905f5c47..ef524b57 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -135,20 +135,15 @@ def load_model_HF(model_name): params = { 'low_cpu_mem_usage': True, 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, + 'attn_implementation': shared.args.attn_implementation, } if shared.args.trust_remote_code: params['trust_remote_code'] = True - if shared.args.use_flash_attention_2: - params['use_flash_attention_2'] = True - if shared.args.force_safetensors: params['force_safetensors'] = True - if shared.args.use_eager_attention: - params['attn_implementation'] = 'eager' - config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) if 'chatglm' in model_name.lower(): diff --git a/modules/ui.py b/modules/ui.py index 0030bb02..9d6777f0 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -144,7 +144,7 @@ def list_model_elements(): 'load_in_4bit', 'torch_compile', 'flash_attn', - 'use_flash_attention_2', + 'attn_implementation', 'cpu', 'disk', 'row_split', @@ -153,7 +153,6 @@ def list_model_elements(): 'mlock', 'numa', 'use_double_quant', - 'use_eager_attention', 'bf16', 'autosplit', 'enable_tp', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index e09e292e..2018a943 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -44,6 +44,7 @@ def create_ui(): shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') + shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. 
q4_q8).') with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) @@ -52,7 +53,6 @@ def create_ui(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') - shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') @@ -96,7 +96,6 @@ def create_ui(): shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') - shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) From c357601c01b52dbfec0670b377b7d60132e85c27 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Jul 2025 18:48:04 -0700 Subject: [PATCH 02/23] Bump transformers to 4.53 --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 34fd13be..6f3b816e 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 9b18524a..7eba3671 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 
6b9cc2a9..6d6e990d 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index b7830355..40077466 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index e48d4dee..c8744377 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 831ff51f..d77dfd1c 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index fb81f40b..b1bfc351 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 3d29c93f..58a246aa 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 7528aed0..b3859a51 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index e86e4524..9e1a46e6 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 30020989..f3d2b08d 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.53.* tqdm wandb From f045b72826820167d475aa2e6cac226485b586ef Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:46:26 -0700 Subject: [PATCH 03/23] Bump accelerate to 1.8 --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 
2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 6f3b816e..34f1f8d1 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* bitsandbytes==0.45.* colorama datasets diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 7eba3671..af117c5c 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* colorama datasets einops diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 6d6e990d..8095c38c 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* colorama datasets einops diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 40077466..d63207b6 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* colorama datasets einops diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index c8744377..a2add551 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* colorama datasets einops diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index d77dfd1c..ec30e05d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* colorama datasets einops diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index b1bfc351..d7cf75db 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* colorama datasets einops diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 58a246aa..1db5c928 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* bitsandbytes==0.45.* colorama datasets diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index b3859a51..9eef3412 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* bitsandbytes==0.45.* colorama datasets diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 9e1a46e6..52a3068a 100644 --- 
a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* bitsandbytes==0.45.* colorama datasets diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index f3d2b08d..4f35ffcc 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,4 +1,4 @@ -accelerate==1.5.* +accelerate==1.8.* colorama datasets einops From 8b3c7aa7952fdf92ae20c4f66e12df85ed8bcf21 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:46:55 -0700 Subject: [PATCH 04/23] Bump bitsandbytes to 0.46 --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 34f1f8d1..791a45eb 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,5 +1,5 @@ accelerate==1.8.* -bitsandbytes==0.45.* +bitsandbytes==0.46.* colorama datasets einops diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 1db5c928..dd5c08ac 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -1,5 +1,5 @@ accelerate==1.8.* -bitsandbytes==0.45.* +bitsandbytes==0.46.* colorama datasets einops diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 9eef3412..17dd31fd 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -1,5 +1,5 @@ accelerate==1.8.* -bitsandbytes==0.45.* +bitsandbytes==0.46.* colorama datasets einops diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 52a3068a..be914200 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -1,5 +1,5 @@ accelerate==1.8.* -bitsandbytes==0.45.* +bitsandbytes==0.46.* colorama datasets einops From b69f435311e47949c6341e298f4b8db396674926 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:56:50 -0700 Subject: [PATCH 05/23] Fix latest transformers being super slow --- modules/transformers_loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py index ef524b57..e4163b6d 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -131,6 +131,8 @@ def load_tokenizer(model_name, tokenizer_dir=None): def load_model_HF(model_name): + torch._dynamo.config.disable = True + path_to_model = Path(f'{shared.args.model_dir}/{model_name}') params = { 'low_cpu_mem_usage': True, From bd4881c4dcead542cd8abf7fc2020b259b8f7af6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:57:37 -0700 Subject: [PATCH 06/23] Use eager attention by default instead of sdpa --- modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 57649114..b13156b7 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -61,7 +61,7 @@ 
group.add_argument('--no-cache', action='store_true', help='Set use_cache to Fal group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.') group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.') -group.add_argument('--attn-implementation', type=str, default='sdpa', help='Attention implementation. Valid options: sdpa, eager, flash_attention_2.') +group.add_argument('--attn-implementation', type=str, default='eager', help='Attention implementation. Valid options: eager, sdpa, flash_attention_2.') group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.') # bitsandbytes 4-bit diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 2018a943..365ca7e7 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -44,7 +44,7 @@ def create_ui(): shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.') + shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['eager', 'sdpa', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. 
q4_q8).') with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) From e015355e4a28e365fa027271c67bd366220bbe52 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Jul 2025 20:03:53 -0700 Subject: [PATCH 07/23] Update README --- README.md | 5 ++--- modules/shared.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c80dacde..2f7fe4f5 100644 --- a/README.md +++ b/README.md @@ -235,7 +235,7 @@ List of command-line flags ```txt usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS] [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] - [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] + [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] @@ -278,8 +278,7 @@ Transformers/Accelerate: --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. - --use_flash_attention_2 Set use_flash_attention_2=True while loading the model. - --use_eager_attention Set attn_implementation= eager while loading the model. + --attn-implementation IMPLEMENTATION Attention implementation. Valid options: eager, sdpa, flash_attention_2. --torch-compile Compile the model with torch.compile for improved performance. bitsandbytes 4-bit: diff --git a/modules/shared.py b/modules/shared.py index b13156b7..0348d41e 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -61,7 +61,7 @@ group.add_argument('--no-cache', action='store_true', help='Set use_cache to Fal group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.') group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.') -group.add_argument('--attn-implementation', type=str, default='eager', help='Attention implementation. 
Valid options: eager, sdpa, flash_attention_2.') +group.add_argument('--attn-implementation', type=str, default='eager', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: eager, sdpa, flash_attention_2.') group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.') # bitsandbytes 4-bit From d1f4622a9666ce9b89e6f7796e6d8a3d33e4209b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Jul 2025 00:15:50 -0300 Subject: [PATCH 08/23] Update peft requirement from ==0.15.* to ==0.16.* in /requirements/full (#7127) --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 791a45eb..b2d0f98a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -10,7 +10,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index af117c5c..e33a831d 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -9,7 +9,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 8095c38c..3302afe5 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -9,7 +9,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index d63207b6..319dbc57 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -9,7 +9,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a2add551..2653c682 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -9,7 +9,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index ec30e05d..ad09c53f 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -9,7 +9,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 
d7cf75db..c2ad7765 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -9,7 +9,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index dd5c08ac..e528b0ef 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -10,7 +10,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 17dd31fd..28b16740 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -10,7 +10,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index be914200..7e0fdaac 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -10,7 +10,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 4f35ffcc..ee5346e6 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -9,7 +9,7 @@ jinja2==3.1.6 markdown numpy==2.2.* pandas -peft==0.15.* +peft==0.16.* Pillow>=9.5.0 psutil pydantic==2.8.2 From 21e0e9f32bf7ed7989538c8b8f0755ae3f4d57c9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 9 Jul 2025 21:05:17 -0700 Subject: [PATCH 09/23] Add the triton-windows requirement on Windows to make transformers functional --- requirements/full/requirements.txt | 1 + requirements/full/requirements_amd.txt | 1 + requirements/full/requirements_amd_noavx2.txt | 1 + requirements/full/requirements_apple_intel.txt | 1 + requirements/full/requirements_apple_silicon.txt | 1 + requirements/full/requirements_cpu_only.txt | 1 + requirements/full/requirements_cpu_only_noavx2.txt | 1 + requirements/full/requirements_cuda128.txt | 1 + requirements/full/requirements_cuda128_noavx2.txt | 1 + requirements/full/requirements_noavx2.txt | 1 + requirements/full/requirements_nowheels.txt | 1 + 11 files changed, 11 insertions(+) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 791a45eb..49000383 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -24,6 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index af117c5c..b72b6806 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -23,6 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 8095c38c..898f3451 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -23,6 +23,7 @@ scipy 
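The requirement added in the hunk below carries a PEP 508 environment marker — the '; platform_system == "Windows"' suffix — so pip installs triton-windows only on Windows and skips it everywhere else. A minimal sketch of how such a marker evaluates, using the packaging library (the same machinery pip builds on):

    from packaging.markers import Marker

    # Evaluate the marker against the current interpreter's environment
    marker = Marker('platform_system == "Windows"')
    print(marker.evaluate())  # True on Windows, False elsewhere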
sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index d63207b6..926baaf0 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -23,6 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a2add551..b1fbebd6 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -23,6 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index ec30e05d..7cb2b2da 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -23,6 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index d7cf75db..1aa90fbd 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -23,6 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index dd5c08ac..2f52baf9 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -24,6 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 17dd31fd..d06a235e 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -24,6 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index be914200..a5231cde 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -24,6 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 4f35ffcc..e4458ff6 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -23,6 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb From e523f25b9fef994a8f7dd29d6713aafb0e53248f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Jul 2025 05:35:57 -0700 Subject: [PATCH 10/23] Downgrade triton-windows to 3.2.0.post19 https://github.com/oobabooga/text-generation-webui/issues/7107#issuecomment-3057250374 --- 
requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index ab1ffce6..6d90e01c 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -24,7 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index ab88dae9..67b08f3f 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -23,7 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 553346ae..94e9cb7f 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -23,7 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index a053374d..e64ca5c9 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -23,7 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 15dd1f90..14f98782 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -23,7 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index bb02e30d..6ed6bb91 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -23,7 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 5da4a198..8ab631a0 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -23,7 +23,7 @@ scipy sentencepiece tensorboard 
transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index f8dca352..bf9ee6cb 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -24,7 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index a3860b6a..691f2ca4 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -24,7 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 2f95e6f7..f569b1d4 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -24,7 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 2e293dcc..69a82184 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -23,7 +23,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.3.1.post19; platform_system == "Windows" +triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb From 0f3a88057ca2e45fe8cae9256aed417c3a0e8601 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Jul 2025 05:39:04 -0700 Subject: [PATCH 11/23] Don't downgrade triton-windows on CUDA 12.8 --- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index bf9ee6cb..f8dca352 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -24,7 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.2.0.post19; platform_system == "Windows" +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 691f2ca4..a3860b6a 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -24,7 +24,7 @@ scipy sentencepiece tensorboard transformers==4.53.* -triton-windows==3.2.0.post19; platform_system == "Windows" +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb From 635e6efd18cbf12acd9c32142763c1623205947c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Jul 2025 07:04:47 -0700 Subject: [PATCH 12/23] Ignore add_bos_token in instruct prompts, let the jinja2 template decide --- modules/chat.py | 4 ++++ modules/text_generation.py | 11 +++-------- 
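The text_generation.py hunk below drops the old conditional BOS handling in favor of a single guard that always strips duplicate leading BOS tokens — e.g. when a Jinja2 chat template has already inserted one and encode() prepends another. A toy illustration of that guard (the bos_token_id of 1 and the other token IDs are made up):

    import torch

    bos = 1  # illustrative bos_token_id
    input_ids = torch.tensor([[bos, bos, 15043, 3186]])  # double BOS at the start

    # Always prevent double BOS tokens (regardless of add_bos_token)
    while len(input_ids[0]) > 1 and input_ids[0][0] == bos and input_ids[0][1] == bos:
        input_ids = input_ids[:, 1:]

    print(input_ids)  # tensor([[    1, 15043,  3186]]) — a single BOS survives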
modules/ui_parameters.py | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 9290dd62..827b6050 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -643,6 +643,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess output = apply_extensions('history', output) state = apply_extensions('state', state) + # Let the jinja2 template handle the BOS token + if state['mode'] in ['instruct', 'chat-instruct']: + state['add_bos_token'] = False + # Initialize metadata if not present if 'metadata' not in output: output['metadata'] = {} diff --git a/modules/text_generation.py b/modules/text_generation.py index a75141f1..8d1950b9 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -134,7 +134,6 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt input_ids = np.array(input_ids).reshape(1, len(input_ids)) else: input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) - if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None: if add_bos_token: # Add BOS token if missing @@ -142,13 +141,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]]) input_ids = torch.cat((bos_tensor, input_ids), 1) - # Prevent double BOS tokens from jinja templates - while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id: - input_ids = input_ids[:, 1:] - else: - # Remove BOS tokens when not wanted - while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id: - input_ids = input_ids[:, 1:] + # Always prevent double BOS tokens (regardless of add_bos_token setting) + while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id: + input_ids = input_ids[:, 1:] if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 5c92f32e..45bef5f9 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -84,7 +84,7 @@ def create_ui(): shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') - shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') + shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Only applies to text completion (notebook). 
In chat mode, templates control BOS tokens.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') From 188c7c8f2b987d0af95e2b3c9df2004cda269941 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:42:52 -0700 Subject: [PATCH 13/23] Revert "CSS simplifications" This reverts commit c6c1b725e95594f03891cdabe0be51ca8cb33c0a. --- css/chat_style-Dark.css | 20 +++++++++++++++++++- css/chat_style-TheEncrypted777.css | 20 +++++++++++++++++++- css/chat_style-cai-chat.css | 21 ++++++++++++++++++++- css/chat_style-messenger.css | 23 ++++++++++++++++++++++- css/chat_style-wpp.css | 20 +++++++++++++++++++- css/html_readable_style.css | 12 +++++++++++- 6 files changed, 110 insertions(+), 6 deletions(-) diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css index 11603ff5..aae82f8b 100644 --- a/css/chat_style-Dark.css +++ b/css/chat_style-Dark.css @@ -35,6 +35,10 @@ color: #f0f0f0; /* Light text color for readability */ } +.text p { + margin-top: 2px; +} + .username { padding-left: 10px; font-size: 20px; @@ -125,7 +129,7 @@ } } -/* Standard spacing from instruct style */ +/* Standard spacing */ .chat .message-body :is(p, ul, ol) { margin: 1.25em 0 !important; } @@ -137,3 +141,17 @@ .chat .message-body :is(p, ul, ol):last-child { margin-bottom: 0 !important; } + +.chat .message-body ul, .chat .message-body ol { + padding-inline-start: 2em; +} + +.message-body li { + list-style-position: outside; + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index e1f37839..7f37377d 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -38,6 +38,10 @@ text-shadow: 2px 2px 2px rgb(0 0 0 / 40%); } +.text p { + margin-top: 2px; +} + .username { padding-left: 10px; font-size: 22px; @@ -131,7 +135,7 @@ } } -/* Standard spacing from instruct style */ +/* Standard spacing */ .chat .message-body :is(p, ul, ol) { margin: 1.25em 0 !important; } @@ -143,3 +147,17 @@ .chat .message-body :is(p, ul, ol):last-child { margin-bottom: 0 !important; } + +.chat .message-body ul, .chat .message-body ol { + padding-inline-start: 2em; +} + +.message-body li { + list-style-position: outside; + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index ef1f1f62..d2e8f44a 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -9,6 +9,11 @@ line-height: 22.5px !important; } +.message-body { + margin-top: 3px; + font-size: 15px !important; +} + .circle-you { width: 50px; height: 50px; @@ -56,7 +61,7 @@ font-weight: 500; } -/* Standard spacing from instruct style */ +/* Standard spacing */ .chat .message-body :is(p, ul, ol) { margin: 1.25em 0 !important; } @@ -68,3 +73,17 @@ .chat .message-body :is(p, ul, ol):last-child { margin-bottom: 0 !important; } + +.chat .message-body ul, .chat .message-body ol { + padding-inline-start: 2em; +} + 
+.message-body li { + list-style-position: outside; + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index b6da2bb2..a6189a4a 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -68,10 +68,17 @@ max-width: 80%; } +.text p { + margin-top: 5px; +} + .username { font-weight: bold; } +.message-body { +} + .message-body img { max-width: 300px; max-height: 300px; @@ -92,7 +99,7 @@ color: rgb(110 110 110) !important; } -/* Standard spacing from instruct style */ +/* Standard spacing */ .chat .message-body :is(p, ul, ol) { margin: 1.25em 0 !important; } @@ -104,3 +111,17 @@ .chat .message-body :is(p, ul, ol):last-child { margin-bottom: 0 !important; } + +.chat .message-body ul, .chat .message-body ol { + padding-inline-start: 2em; +} + +.message-body li { + list-style-position: outside; + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css index d11172ef..c86e577d 100644 --- a/css/chat_style-wpp.css +++ b/css/chat_style-wpp.css @@ -83,6 +83,10 @@ font-weight: 400; } +.message-body p:first-child { + margin-top: 0 !important; +} + .dark .message-body p em { color: rgb(170 170 170) !important; } @@ -96,7 +100,7 @@ margin-top: 8px; } -/* Standard spacing from instruct style */ +/* Standard spacing */ .chat .message-body :is(p, ul, ol) { margin: 1.25em 0 !important; } @@ -108,3 +112,17 @@ .chat .message-body :is(p, ul, ol):last-child { margin-bottom: 0 !important; } + +.chat .message-body ul, .chat .message-body ol { + padding-inline-start: 2em; +} + +.message-body li { + list-style-position: outside; + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} diff --git a/css/html_readable_style.css b/css/html_readable_style.css index 74f69b73..74a913e9 100644 --- a/css/html_readable_style.css +++ b/css/html_readable_style.css @@ -31,7 +31,7 @@ font-size: 14px; } -/* Standard spacing from instruct style */ +/* Standard spacing */ .readable-container :is(p, ul, ol) { margin: 1.25em 0 !important; } @@ -43,3 +43,13 @@ .readable-container :is(p, ul, ol):last-child { margin-bottom: 0 !important; } + +.readable-container ul, .readable-container ol { + padding-inline-start: 2em; +} + +.readable-container li { + list-style-position: outside; + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} From caf69d871a74b603f52fc5f0fe644c8ec80b07f3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:43:01 -0700 Subject: [PATCH 14/23] Revert "Standardize margins and paddings across all chat styles" This reverts commit 86cb5e058711633bd4fa45faabcac699020386e5. 
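Note: together with the previous revert, this returns the chat styles to their pre-refactor spacing. The shared "Standard spacing" blocks are removed again, and each style's original per-style rules (e.g. margin-bottom: 0 !important or 10px !important on .message-body p) come back, as the hunks below show.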
--- css/chat_style-Dark.css | 28 +-------------------------- css/chat_style-TheEncrypted777.css | 28 +-------------------------- css/chat_style-cai-chat.css | 31 ++++-------------------------- css/chat_style-messenger.css | 28 +-------------------------- css/chat_style-wpp.css | 27 ++------------------------ css/html_readable_style.css | 28 +++------------------------ 6 files changed, 12 insertions(+), 158 deletions(-) diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css index aae82f8b..6a4784cc 100644 --- a/css/chat_style-Dark.css +++ b/css/chat_style-Dark.css @@ -91,6 +91,7 @@ } .message-body p { + margin-bottom: 0 !important; font-size: 16px !important; line-height: 1.5 !important; color: #e0e0e0 !important; /* Light color for text */ @@ -128,30 +129,3 @@ font-size: 18px; /* Smaller username for mobile */ } } - -/* Standard spacing */ -.chat .message-body :is(p, ul, ol) { - margin: 1.25em 0 !important; -} - -.chat .message-body :is(p, ul, ol):first-child { - margin-top: 0 !important; -} - -.chat .message-body :is(p, ul, ol):last-child { - margin-bottom: 0 !important; -} - -.chat .message-body ul, .chat .message-body ol { - padding-inline-start: 2em; -} - -.message-body li { - list-style-position: outside; - margin-top: 0.5em !important; - margin-bottom: 0.5em !important; -} - -.message-body li > p { - display: inline !important; -} diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 7f37377d..fbd47072 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -87,6 +87,7 @@ } .message-body p { + margin-bottom: 0 !important; font-size: 18px !important; line-height: 1.428571429 !important; color: rgb(243 244 246) !important; @@ -134,30 +135,3 @@ font-size: 20px; } } - -/* Standard spacing */ -.chat .message-body :is(p, ul, ol) { - margin: 1.25em 0 !important; -} - -.chat .message-body :is(p, ul, ol):first-child { - margin-top: 0 !important; -} - -.chat .message-body :is(p, ul, ol):last-child { - margin-bottom: 0 !important; -} - -.chat .message-body ul, .chat .message-body ol { - padding-inline-start: 2em; -} - -.message-body li { - list-style-position: outside; - margin-top: 0.5em !important; - margin-bottom: 0.5em !important; -} - -.message-body li > p { - display: inline !important; -} diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index d2e8f44a..b06b1269 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -52,6 +52,10 @@ font-weight: 500; } +.message-body p, .chat .message-body ul, .chat .message-body ol { + margin-bottom: 10px !important; +} + .dark .message-body p em { color: rgb(138 138 138) !important; } @@ -60,30 +64,3 @@ color: rgb(110 110 110) !important; font-weight: 500; } - -/* Standard spacing */ -.chat .message-body :is(p, ul, ol) { - margin: 1.25em 0 !important; -} - -.chat .message-body :is(p, ul, ol):first-child { - margin-top: 0 !important; -} - -.chat .message-body :is(p, ul, ol):last-child { - margin-bottom: 0 !important; -} - -.chat .message-body ul, .chat .message-body ol { - padding-inline-start: 2em; -} - -.message-body li { - list-style-position: outside; - margin-top: 0.5em !important; - margin-bottom: 0.5em !important; -} - -.message-body li > p { - display: inline !important; -} diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index a6189a4a..65af5f7a 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -86,6 +86,7 @@ } .message-body p { + margin-bottom: 0 !important; font-size: 
   font-size: 15px !important;
   line-height: 1.428571429 !important;
   font-weight: 500;
@@ -98,30 +99,3 @@
 .message-body p em {
   color: rgb(110 110 110) !important;
 }
-
-/* Standard spacing */
-.chat .message-body :is(p, ul, ol) {
-  margin: 1.25em 0 !important;
-}
-
-.chat .message-body :is(p, ul, ol):first-child {
-  margin-top: 0 !important;
-}
-
-.chat .message-body :is(p, ul, ol):last-child {
-  margin-bottom: 0 !important;
-}
-
-.chat .message-body ul, .chat .message-body ol {
-  padding-inline-start: 2em;
-}
-
-.message-body li {
-  list-style-position: outside;
-  margin-top: 0.5em !important;
-  margin-bottom: 0.5em !important;
-}
-
-.message-body li > p {
-  display: inline !important;
-}
diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css
index c86e577d..b2ac4d2a 100644
--- a/css/chat_style-wpp.css
+++ b/css/chat_style-wpp.css
@@ -100,29 +100,6 @@
   margin-top: 8px;
 }
 
-/* Standard spacing */
-.chat .message-body :is(p, ul, ol) {
-  margin: 1.25em 0 !important;
-}
-
-.chat .message-body :is(p, ul, ol):first-child {
-  margin-top: 0 !important;
-}
-
-.chat .message-body :is(p, ul, ol):last-child {
-  margin-bottom: 0 !important;
-}
-
-.chat .message-body ul, .chat .message-body ol {
-  padding-inline-start: 2em;
-}
-
-.message-body li {
-  list-style-position: outside;
-  margin-top: 0.5em !important;
-  margin-bottom: 0.5em !important;
-}
-
-.message-body li > p {
-  display: inline !important;
+.message-body p, .chat .message-body ul, .chat .message-body ol {
+  margin-bottom: 10px !important;
 }
diff --git a/css/html_readable_style.css b/css/html_readable_style.css
index 74a913e9..760649d6 100644
--- a/css/html_readable_style.css
+++ b/css/html_readable_style.css
@@ -11,8 +11,9 @@
 
 .readable-container p, .readable-container li {
   font-size: 16px !important;
-  line-height: 1.4 !important;
   color: #efefef !important;
+  margin-bottom: 22px;
+  line-height: 1.4 !important;
 }
 
 .readable-container li > p {
@@ -29,27 +30,4 @@
 
 .readable-container .hoverable {
   font-size: 14px;
-}
-
-/* Standard spacing */
-.readable-container :is(p, ul, ol) {
-  margin: 1.25em 0 !important;
-}
-
-.readable-container :is(p, ul, ol):first-child {
-  margin-top: 0 !important;
-}
-
-.readable-container :is(p, ul, ol):last-child {
-  margin-bottom: 0 !important;
-}
-
-.readable-container ul, .readable-container ol {
-  padding-inline-start: 2em;
-}
-
-.readable-container li {
-  list-style-position: outside;
-  margin-top: 0.5em !important;
-  margin-bottom: 0.5em !important;
-}
+}
\ No newline at end of file

From 273888f2181fb6377a0921fb0eb198e5a0c7ca4c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 10 Jul 2025 18:56:46 -0700
Subject: [PATCH 15/23] Revert "Use eager attention by default instead of sdpa"

This reverts commit bd4881c4dcead542cd8abf7fc2020b259b8f7af6.
---
 modules/shared.py        | 2 +-
 modules/ui_model_menu.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index 0348d41e..d368062b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -61,7 +61,7 @@ group.add_argument('--no-cache', action='store_true', help='Set use_cache to Fal
 group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
 group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
-group.add_argument('--attn-implementation', type=str, default='eager', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: eager, sdpa, flash_attention_2.')
+group.add_argument('--attn-implementation', type=str, default='sdpa', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: sdpa, eager, flash_attention_2.')
 group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.')
 
 # bitsandbytes 4-bit
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 365ca7e7..2018a943 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -44,7 +44,7 @@ def create_ui():
             shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
             shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.')
             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
-            shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['eager', 'sdpa', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
+            shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
             shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
         with gr.Column():
             shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())

From 5a8a9c22e855cb8886c3215deacf1786d5c94066 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 11 Jul 2025 09:20:27 -0700
Subject: [PATCH 16/23] Update llama.cpp

---
 requirements/full/requirements.txt                     | 4 ++--
 requirements/full/requirements_amd.txt                 | 4 ++--
 requirements/full/requirements_amd_noavx2.txt          | 4 ++--
 requirements/full/requirements_apple_intel.txt         | 4 ++--
 requirements/full/requirements_apple_silicon.txt       | 6 +++---
 requirements/full/requirements_cpu_only.txt            | 4 ++--
 requirements/full/requirements_cpu_only_noavx2.txt     | 4 ++--
 requirements/full/requirements_cuda128.txt             | 4 ++--
 requirements/full/requirements_cuda128_noavx2.txt      | 4 ++--
 requirements/full/requirements_noavx2.txt              | 4 ++--
 requirements/portable/requirements.txt                 | 4 ++--
 requirements/portable/requirements_apple_intel.txt     | 4 ++--
 requirements/portable/requirements_apple_silicon.txt   | 6 +++---
 requirements/portable/requirements_cpu_only.txt        | 4 ++--
 requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
 requirements/portable/requirements_noavx2.txt          | 4 ++--
 requirements/portable/requirements_vulkan.txt          | 4 ++--
 requirements/portable/requirements_vulkan_noavx2.txt   | 4 ++--
 18 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 6d90e01c..69cf32ad 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 67b08f3f..9bce8e57 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 94e9cb7f..6edf1b2e 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index e64ca5c9..82419dd2 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 14f98782..ac823e68 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 6ed6bb91..0b1514d9 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 8ab631a0..82742176 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
index f8dca352..6c890c6b 100644
--- a/requirements/full/requirements_cuda128.txt
+++ b/requirements/full/requirements_cuda128.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
index a3860b6a..05b90d3b 100644
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ b/requirements/full/requirements_cuda128_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index f569b1d4..d3b3efbf 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index e9fa61f1..f3d97d52 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 4f9b0668..3cf8fcd1 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 9f403a0b..52b7e72e 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 8e9baa32..b5b114cc 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index ebd63591..43866082 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 6802a337..efb62cbb 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 624c222d..1f0f18f3 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index edb82a31..75a04a26 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.24.0/llama_cpp_binaries-0.24.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 1d1b20bd77196cf07243e444eb8ed105cf7654d0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 11 Jul 2025 10:51:23 -0700
Subject: [PATCH 17/23] Remove the --torch-compile option (it doesn't do
 anything currently)

---
 README.md                      | 9 ++++-----
 modules/loaders.py             | 1 -
 modules/shared.py              | 1 -
 modules/transformers_loader.py | 3 ---
 modules/ui.py                  | 1 -
 modules/ui_model_menu.py       | 3 +--
 6 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 2f7fe4f5..907d8c38 100644
--- a/README.md
+++ b/README.md
@@ -235,9 +235,9 @@ List of command-line flags
 
 ```txt
 usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS] [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
-                 [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--torch-compile] [--load-in-4bit]
-                 [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap]
-                 [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
+                 [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant]
+                 [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock]
+                 [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
                  [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] [--autosplit]
                  [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK]
                  [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
@@ -278,8 +278,7 @@ Transformers/Accelerate:
   --trust-remote-code                      Set trust_remote_code=True while loading the model. Necessary for some models.
   --force-safetensors                      Set use_safetensors=True while loading the model. This prevents arbitrary code execution.
   --no_use_fast                            Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast.
-  --attn-implementation IMPLEMENTATION     Attention implementation. Valid options: eager, sdpa, flash_attention_2.
-  --torch-compile                          Compile the model with torch.compile for improved performance.
+  --attn-implementation IMPLEMENTATION     Attention implementation. Valid options: sdpa, eager, flash_attention_2.
 
 bitsandbytes 4-bit:
   --load-in-4bit                           Load the model with 4-bit precision (using bitsandbytes).
diff --git a/modules/loaders.py b/modules/loaders.py
index 2b2c6b78..f515aeca 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -39,7 +39,6 @@ loaders_and_params = OrderedDict({
         'quant_type',
         'load_in_8bit',
        'load_in_4bit',
-        'torch_compile',
         'attn_implementation',
         'cpu',
         'disk',
diff --git a/modules/shared.py b/modules/shared.py
index d368062b..5e3e11c0 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -62,7 +62,6 @@ group.add_argument('--trust-remote-code', action='store_true', help='Set trust_r
 group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
 group.add_argument('--attn-implementation', type=str, default='sdpa', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: sdpa, eager, flash_attention_2.')
-group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.')
 
 # bitsandbytes 4-bit
 group = parser.add_argument_group('bitsandbytes 4-bit')
diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index e4163b6d..2f7367a4 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -258,9 +258,6 @@ def load_model_HF(model_name):
     print()
     model = LoaderClass.from_pretrained(path_to_model, **params)
 
-    if shared.args.torch_compile:
-        model = torch.compile(model)
-
     return model
 
 
diff --git a/modules/ui.py b/modules/ui.py
index 9d6777f0..98acc038 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -142,7 +142,6 @@ def list_model_elements():
         'num_experts_per_token',
         'load_in_8bit',
         'load_in_4bit',
-        'torch_compile',
         'flash_attn',
         'attn_implementation',
         'cpu',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 2018a943..ba7c529e 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -52,7 +52,6 @@ def create_ui():
             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
-            shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
             shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
             shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
@@ -131,7 +130,7 @@ def create_ui():
 
 def create_event_handlers():
     mu = shared.args.multi_user
     if mu:
-        return
+        return
 
     shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params()), show_progress=False)

From 845432b9b47d948adad872cc593e85de4c96c5b9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 14 Jul 2025 21:03:18 -0700
Subject: [PATCH 18/23] Remove the obsolete modules/relative_imports.py file

---
 modules/relative_imports.py | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 modules/relative_imports.py

diff --git a/modules/relative_imports.py b/modules/relative_imports.py
deleted file mode 100644
index 3c0eb56b..00000000
--- a/modules/relative_imports.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import sys
-from pathlib import Path
-
-
-class RelativeImport:
-    def __init__(self, path):
-        self.import_path = Path(path)
-
-    def __enter__(self):
-        sys.path.insert(0, str(self.import_path))
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        sys.path.remove(str(self.import_path))

From 03fb85e49ab8d8d9019db09d67081ae3a540f3d9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 15 Jul 2025 07:37:13 -0700
Subject: [PATCH 19/23] Update llama.cpp

---
 requirements/full/requirements.txt                     | 4 ++--
 requirements/full/requirements_amd.txt                 | 4 ++--
 requirements/full/requirements_amd_noavx2.txt          | 4 ++--
 requirements/full/requirements_apple_intel.txt         | 4 ++--
 requirements/full/requirements_apple_silicon.txt       | 6 +++---
 requirements/full/requirements_cpu_only.txt            | 4 ++--
 requirements/full/requirements_cpu_only_noavx2.txt     | 4 ++--
 requirements/full/requirements_cuda128.txt             | 4 ++--
 requirements/full/requirements_cuda128_noavx2.txt      | 4 ++--
 requirements/full/requirements_noavx2.txt              | 4 ++--
 requirements/portable/requirements.txt                 | 4 ++--
 requirements/portable/requirements_apple_intel.txt     | 4 ++--
 requirements/portable/requirements_apple_silicon.txt   | 6 +++---
 requirements/portable/requirements_cpu_only.txt        | 4 ++--
 requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
 requirements/portable/requirements_noavx2.txt          | 4 ++--
 requirements/portable/requirements_vulkan.txt          | 4 ++--
 requirements/portable/requirements_vulkan_noavx2.txt   | 4 ++--
 18 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 69cf32ad..a45a1446 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 9bce8e57..b67e97a1 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 6edf1b2e..bf02245d 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 82419dd2..5b2dac77 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index ac823e68..9e2ef7ba 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 0b1514d9..34958595 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 82742176..09407740 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
index 6c890c6b..432018c5 100644
--- a/requirements/full/requirements_cuda128.txt
+++ b/requirements/full/requirements_cuda128.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
index 05b90d3b..a737c5e1 100644
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ b/requirements/full/requirements_cuda128_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index d3b3efbf..914feeae 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index f3d97d52..ceec09aa 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 3cf8fcd1..a53cbed8 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 52b7e72e..823ec028 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index b5b114cc..ce6c215b 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 43866082..1db63e55 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index efb62cbb..b8929f17 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 1f0f18f3..612c9ef2 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 75a04a26..bf504c95 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.26.0/llama_cpp_binaries-0.26.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 9371867238c4a38cfe955067fce7bcadfaef4205 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 15 Jul 2025 07:38:03 -0700 Subject: [PATCH 20/23] Update exllamav2 --- requirements/full/requirements.txt | 6 +++--- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cuda128.txt | 6 +++--- requirements/full/requirements_cuda128_noavx2.txt | 6 +++--- requirements/full/requirements_noavx2.txt | 6 +++--- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index a45a1446..946b312e 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" 
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index b67e97a1..8047ed78 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -35,5 +35,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index bf02245d..1736fe58 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -35,5 +35,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" 
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 5b2dac77..964ec5bd 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -36,4 +36,4 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 9e2ef7ba..30547404 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -37,4 +37,4 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 432018c5..3b60343c 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine 
== "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index a737c5e1..73c21669 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and 
platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 914feeae..eb123b99 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From a00983b2ba1341c6e17f8f37aa9386252668dc6e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 19 Jul 2025 12:05:50 -0700 Subject: [PATCH 21/23] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- 
requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 946b312e..b83cc987 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 8047ed78..8224d987 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 1736fe58..22141a8a 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 964ec5bd..e9290bcb 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git 
a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 30547404..449fde87 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 34958595..8a84e403 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 09407740..9488f5e7 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 3b60343c..c69cfb5d 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 73c21669..4f24e459 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index eb123b99..537b2004 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index ceec09aa..53479a80 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git 
a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index a53cbed8..d7336d2f 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 823ec028..1edaa515 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index ce6c215b..04c9b283 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 1db63e55..3c3563d3 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index b8929f17..cf0d7b11 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 612c9ef2..9bd8a37c 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index bf504c95..b8519553 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.27.0/llama_cpp_binaries-0.27.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From ccf5e3e3a7d20d04559f37fbc8ca3ed346affc58 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 19 Jul 2025 12:07:38 -0700 Subject: [PATCH 22/23] Update exllamav3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index b83cc987..687f1f5a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -36,8 +36,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index e9290bcb..04325464 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -35,5 +35,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 449fde87..9497575f 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -36,5 +36,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index c69cfb5d..a2af5108 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -36,8 +36,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
index c69cfb5d..a2af5108 100644
--- a/requirements/full/requirements_cuda128.txt
+++ b/requirements/full/requirements_cuda128.txt
@@ -36,8 +36,8 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
index 4f24e459..948a275a 100644
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ b/requirements/full/requirements_cuda128_noavx2.txt
@@ -36,8 +36,8 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 537b2004..8f7106e4 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -36,8 +36,8 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.4/exllamav3-0.0.4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
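Besides the markers, each filename above carries wheel compatibility tags: `cp311-cp311-linux_x86_64` binds a build to CPython 3.11 on x86_64 Linux, `py3-none-any` is the pure-Python fallback, and the PEP 440 local version segment (`+cu128.torch2.7.0`, `+cu124.torch2.6.0`) records which CUDA/torch combination the binary was compiled against. A sketch of the tag check pip performs before installing, again using the `packaging` library (the filename is one of the pinned wheels above; the script is illustrative, not part of the patches):

from packaging.tags import sys_tags
from packaging.utils import parse_wheel_filename

# One of the pinned exllamav3 wheels from the diffs above.
filename = "exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl"
name, version, build, wheel_tags = parse_wheel_filename(filename)

# sys_tags() yields every tag the running interpreter accepts, most
# preferred first; any overlap with the wheel's tags means installable.
compatible = any(tag in wheel_tags for tag in sys_tags())
print(name, version, "compatible" if compatible else "incompatible")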
From 0c667de7a75d97bf8066e83349f3b1157b0e35ba Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 19 Jul 2025 12:14:41 -0700
Subject: [PATCH 23/23] UI: Add a None option for the speculative decoding
 model (closes #7145)

---
 modules/ui_model_menu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index ba7c529e..0ab67e7a 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -62,8 +62,8 @@ def create_ui():
         # Speculative decoding
         with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
             with gr.Row():
-                shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
-                ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
+                shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=['None'] + utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
+                ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': ['None'] + utils.get_available_models()}, 'refresh-button', interactive=not mu)
             shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
             shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.')
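One subtlety in this patch: the dropdown's new sentinel is the literal string 'None', not Python's None, so whatever consumes shared.args.model_draft has to treat the two as equivalent before deciding whether to load a draft model. A hypothetical normalization helper (not part of this patch) illustrating the check the loading code needs somewhere downstream:

from typing import Optional

def normalize_draft_model(choice: Optional[str]) -> Optional[str]:
    """Hypothetical helper: collapse the UI's 'None' string (and an
    empty selection) into a real None so callers can simply test
    `if draft_model is None: skip speculative decoding`."""
    return None if choice in (None, '', 'None') else choice

assert normalize_draft_model('None') is None
assert normalize_draft_model('my-draft-model') == 'my-draft-model'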