From 57f6e9af5a8defd67959672e8dc92040be91a0a5 Mon Sep 17 00:00:00 2001
From: altoiddealer
Date: Wed, 13 Aug 2025 15:47:27 -0400
Subject: [PATCH 01/25] Set multimodal status during Model Loading (#7199)

---
 modules/exllamav3.py        | 10 +++++-----
 modules/llama_cpp_server.py | 24 ++++++++++++++++++++----
 modules/models.py           |  4 ++++
 modules/shared.py           |  1 +
 4 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 66e25693..e580bbda 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -177,9 +177,6 @@ class Exllamav3Model:
         Process all possible image inputs and return modified prompt + embeddings.
         Returns: (processed_prompt, image_embeddings)
         """
-        if not self.is_multimodal():
-            return prompt, []
-
         # Collect images from various sources using shared utilities
         pil_images = []
 
@@ -234,8 +231,11 @@ class Exllamav3Model:
         """
         Generate text with streaming using native ExLlamaV3 API
         """
-        # Process images and modify prompt (ExLlamaV3-specific)
-        prompt, image_embeddings = self._process_images_for_generation(prompt, state)
+        image_embeddings = []
+
+        if shared.is_multimodal:
+            # Process images and modify prompt (ExLlamaV3-specific)
+            prompt, image_embeddings = self._process_images_for_generation(prompt, state)
 
         # Greedy decoding is a special case
         if state['temperature'] == 0:
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index e82edb90..5953803a 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -8,6 +8,7 @@ import sys
 import threading
 import time
 from pathlib import Path
+from typing import Any, List
 
 import llama_cpp_binaries
 import requests
@@ -129,10 +130,10 @@ class LlamaServer:
 
         return payload
 
-    def generate_with_streaming(self, prompt, state):
-        url = f"http://127.0.0.1:{self.port}/completion"
-        payload = self.prepare_payload(state)
-
+    def _process_images_for_generation(self, state: dict) -> List[Any]:
+        """
+        Process all possible image inputs and return PIL images
+        """
         pil_images = []
         # Source 1: Web UI (from chatbot_wrapper)
         if 'image_attachments' in state and state['image_attachments']:
@@ -144,6 +145,21 @@ class LlamaServer:
         elif 'raw_images' in state and state['raw_images']:
             pil_images.extend(state.get('raw_images', []))
 
+        return pil_images
+
+    def is_multimodal(self) -> bool:
+        """Check if this model supports multimodal input."""
+        return shared.args.mmproj not in [None, 'None']
+
+    def generate_with_streaming(self, prompt, state):
+        url = f"http://127.0.0.1:{self.port}/completion"
+        payload = self.prepare_payload(state)
+
+        pil_images = []
+
+        if shared.is_multimodal:
+            pil_images = self._process_images_for_generation(state)
+
         if pil_images:
             # Multimodal case
             IMAGE_TOKEN_COST_ESTIMATE = 600  # A safe, conservative estimate per image
diff --git a/modules/models.py b/modules/models.py
index cc500a40..938eed3d 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -55,6 +55,10 @@ def load_model(model_name, loader=None):
     if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
         shared.settings['truncation_length'] = shared.args.ctx_size
 
+    shared.is_multimodal = False
+    if loader.lower() in ('exllamav3', 'llama.cpp'):
+        shared.is_multimodal = model.is_multimodal()
+
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
     logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
diff --git a/modules/shared.py b/modules/shared.py
index e9d8a62f..a1f4571e 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -16,6 +16,7 @@
 model = None
 tokenizer = None
 model_name = 'None'
 is_seq2seq = False
+is_multimodal = False
 model_dirty_from_training = False
 lora_names = []
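PATCH 01 above follows a simple capability-flag pattern: multimodal support is detected once, while the model is loaded, and cached in `shared.is_multimodal`, so the per-request generation code only tests a boolean instead of re-probing the backend. A minimal, self-contained sketch of that idea follows; the names (`FakeModel`, `IS_MULTIMODAL`, the mmproj filename) are hypothetical stand-ins for the project's real `shared` state and loader wrappers, not its actual API.

```python
# Sketch of the load-time multimodal flag from PATCH 01 (hypothetical names).

class FakeModel:
    """Stand-in for an ExLlamaV3 / llama.cpp model wrapper."""
    def __init__(self, mmproj=None):
        self.mmproj = mmproj

    def is_multimodal(self):
        # llama.cpp treats a configured mmproj file as "vision enabled"
        return self.mmproj not in (None, 'None')


IS_MULTIMODAL = False  # module-level cache, mirroring shared.is_multimodal


def load_model(loader, mmproj=None):
    """Load a model and record, once, whether it accepts image input."""
    global IS_MULTIMODAL
    model = FakeModel(mmproj)
    IS_MULTIMODAL = loader.lower() in ('exllamav3', 'llama.cpp') and model.is_multimodal()
    return model


model = load_model('llama.cpp', mmproj='some-mmproj.gguf')
print(IS_MULTIMODAL)  # True -> generation would take its image-processing branch
```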
From 73a8a737b23fc195c52ef1d9021993fd13e28e33 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 13 Aug 2025 18:23:18 -0700
Subject: [PATCH 02/25] docs: Improve the multimodal examples slightly

---
 docs/12 - OpenAI API.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 83bba41f..227541a3 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -93,7 +93,10 @@ curl http://127.0.0.1:5000/v1/chat/completions \
         {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}}
       ]
     }
-  ]
+  ],
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "top_k": 20
 }'
 ```
 
@@ -127,7 +130,10 @@ curl http://127.0.0.1:5000/v1/completions \
         }
       ]
     }
-  ]
+  ],
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "top_k": 20
 }'
 ```

From d771ca4a13b9837e169cd44815bb3a86bc6c8a4b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 14 Aug 2025 12:02:30 -0700
Subject: [PATCH 03/25] Fix web search (attempt)

---
 modules/web_search.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/modules/web_search.py b/modules/web_search.py
index 3b1f6e18..597af4b2 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -1,6 +1,8 @@
 import concurrent.futures
 import html
+import random
 import re
+import urllib.request
 from concurrent.futures import as_completed
 from datetime import datetime
 from urllib.parse import quote_plus
@@ -50,16 +52,21 @@ def download_web_page(url, timeout=10):
 
 def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
     """Perform web search and return results with content"""
     try:
-        # Use DuckDuckGo HTML search endpoint
         search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
-        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
-        response = requests.get(search_url, headers=headers, timeout=timeout)
-        response.raise_for_status()
+        agents = [
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+        ]
+
+        response_text = ""
+        req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
+        with urllib.request.urlopen(req, timeout=timeout) as response:
+            response_text = response.read().decode('utf-8')
 
         # Extract results with regex
-        titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response.text, re.DOTALL)
-        urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response.text, re.DOTALL)
+        titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+        urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
 
         # Prepare download tasks
         download_tasks = []

From dbabe67e776d46bb0a84987a9c484a59bd75d8db Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 13:19:11 -0700
Subject: [PATCH 04/25] ExLlamaV3: Enable the --enable-tp option, add a --tp-backend option

---
 modules/exllamav3.py     | 5 +++++
 modules/loaders.py       | 2 ++
 modules/shared.py        | 6 +++++-
 modules/ui.py            | 1 +
 modules/ui_model_menu.py | 4 +++-
 5 files changed, 16 insertions(+),
2 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index e580bbda..73962977 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -91,6 +91,11 @@ class Exllamav3Model: split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] load_params['use_per_device'] = split + # Tensor-parallelism + if shared.args.enable_tp: + load_params['tensor_p'] = True + load_params['tp_backend'] = shared.args.tp_backend + model.load(**load_params) tokenizer = Tokenizer.from_config(config) diff --git a/modules/loaders.py b/modules/loaders.py index 8b7e6cce..295db1e7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -65,6 +65,8 @@ loaders_and_params = OrderedDict({ 'draft_max', 'ctx_size_draft', 'speculative_decoding_accordion', + 'enable_tp', + 'tp_backend', ], 'ExLlamav2_HF': [ 'ctx_size', diff --git a/modules/shared.py b/modules/shared.py index a1f4571e..644261a0 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -101,6 +101,11 @@ group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.') +# ExLlamaV3 +group = parser.add_argument_group('ExLlamaV3') +group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) to split the model across GPUs.') +group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.') + # ExLlamaV2 group = parser.add_argument_group('ExLlamaV2') group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.') @@ -110,7 +115,6 @@ group.add_argument('--no_flash_attn', action='store_true', help='Force flash-att group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.') group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.') group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.') -group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') # TensorRT-LLM group = parser.add_argument_group('TensorRT-LLM') diff --git a/modules/ui.py b/modules/ui.py index 1171cd48..502005e7 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -155,6 +155,7 @@ def list_model_elements(): 'bf16', 'autosplit', 'enable_tp', + 'tp_backend', 'no_flash_attn', 'no_xformers', 'no_sdpa', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 6972a17e..dd240627 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -46,6 +46,8 @@ def create_ui(): shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. 
Example: 20,7,7') shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') + shared.gradio['tp_backend'] = gr.Dropdown(label="tp-backend", choices=['native', 'nccl'], value=shared.args.tp_backend, info='The backend for tensor parallelism.') + with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') @@ -54,7 +56,7 @@ def create_ui(): shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') - shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') + shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). 
\n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') From a633793a0007d435ed4f2cf08f0fbb4b77651b91 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 13:19:42 -0700 Subject: [PATCH 05/25] Bump exllamav3 to 0.0.6 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index eb7742b1..d0282ee9 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -36,8 +36,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 0890b2a5..13361a78 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -35,5 +35,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index da3010c6..e4a30168 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -36,5 +36,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 807d0a21..eeee9ff3 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -36,8 +36,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 41e96574..d08f23ca 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -36,8 +36,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 72ba7103..71fd227d 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -36,8 +36,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" From 9651b5c873649e5c967142f9f78e7ad6cf59aaf5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 13:22:55 -0700 Subject: [PATCH 06/25] Make CUDA 12.8 the default CUDA option, remove the CUDA 12.4 option Exllamav3 doesn't compile with torch 2.6 anymore, and torch 2.7 requires newer CUDA. --- one_click.py | 20 +++++++-- requirements/full/requirements.txt | 14 +++--- requirements/full/requirements_cuda128.txt | 45 ------------------- .../full/requirements_cuda128_noavx2.txt | 45 ------------------- requirements/full/requirements_noavx2.txt | 14 +++--- 5 files changed, 30 insertions(+), 108 deletions(-) delete mode 100644 requirements/full/requirements_cuda128.txt delete mode 100644 requirements/full/requirements_cuda128_noavx2.txt diff --git a/one_click.py b/one_click.py index 050da76b..486e893e 100644 --- a/one_click.py +++ b/one_click.py @@ -16,7 +16,7 @@ import sys # os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030' # Define the required versions -TORCH_VERSION = "2.6.0" +TORCH_VERSION = "2.7.0" PYTHON_VERSION = "3.11" LIBSTDCXX_VERSION_LINUX = "12.1.0" @@ -113,17 +113,16 @@ def get_gpu_choice(): choice = get_user_choice( "What is your GPU?", { - 'A': 'NVIDIA - CUDA 12.4', + 'A': 'NVIDIA', 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4', 'C': 'Apple M Series', 'D': 'Intel Arc (beta)', - 'E': 'NVIDIA - CUDA 12.8', 'N': 'CPU mode' }, ) # Convert choice to GPU name - gpu_choice = {"A": "NVIDIA", "B": "AMD", "C": "APPLE", "D": "INTEL", "E": "NVIDIA_CUDA128", "N": "NONE"}[choice] + gpu_choice = {"A": "NVIDIA_CUDA128", "B": "AMD", "C": "APPLE", "D": "INTEL", "N": "NONE"}[choice] # Save choice to state state['gpu_choice'] = gpu_choice @@ -368,6 +367,19 @@ def update_requirements(initial_installation=False, pull=True): assert_success=True ) + # Check for outdated CUDA 12.4 installs and refuse to update + state = load_state() + if state.get('gpu_choice') == 'NVIDIA': + print_big_message( + "Your current installation uses CUDA 12.4, which has been removed.\n" + "To update to the new default (CUDA 12.8), a clean installation is required.\n\n" + "INSTRUCTIONS:\n" + "1. Delete the 'installer_files' folder in your text-generation-webui directory.\n" + "2. Run the start script again (e.g., start_windows.bat).\n\n" + "This will create a fresh environment with the latest software." 
+ ) + sys.exit(0) + current_commit = get_current_commit() wheels_changed = not os.path.exists(state_file) if not wheels_changed: diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index d0282ee9..eeee9ff3 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -24,7 +24,7 @@ scipy sentencepiece tensorboard transformers==4.55.* -triton-windows==3.2.0.post19; platform_system == "Windows" +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb @@ -36,10 +36,10 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt deleted file mode 100644 index eeee9ff3..00000000 --- a/requirements/full/requirements_cuda128.txt +++ /dev/null @@ -1,45 +0,0 @@ -accelerate==1.8.* -bitsandbytes==0.46.* -colorama -datasets -einops -fastapi==0.112.4 -gradio==4.37.* -html2text==2025.4.15 -jinja2==3.1.6 -markdown -numpy==2.2.* -pandas -peft==0.16.* -Pillow>=9.5.0 -psutil -pydantic==2.8.2 -PyPDF2==3.0.1 -python-docx==1.1.2 -pyyaml -requests -rich -safetensors==0.5.* -scipy -sentencepiece -tensorboard -transformers==4.55.* -triton-windows==3.3.1.post19; platform_system == "Windows" -tqdm -wandb - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt deleted file mode 100644 index d08f23ca..00000000 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ /dev/null @@ -1,45 +0,0 @@ -accelerate==1.8.* -bitsandbytes==0.46.* -colorama -datasets -einops -fastapi==0.112.4 -gradio==4.37.* -html2text==2025.4.15 -jinja2==3.1.6 -markdown -numpy==2.2.* -pandas -peft==0.16.* -Pillow>=9.5.0 -psutil -pydantic==2.8.2 -PyPDF2==3.0.1 -python-docx==1.1.2 -pyyaml -requests -rich -safetensors==0.5.* -scipy -sentencepiece -tensorboard -transformers==4.55.* -triton-windows==3.3.1.post19; platform_system == "Windows" -tqdm -wandb - -# API 
-flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 71fd227d..d08f23ca 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -24,7 +24,7 @@ scipy sentencepiece tensorboard transformers==4.55.* -triton-windows==3.2.0.post19; platform_system == "Windows" +triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb @@ -36,10 +36,10 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; 
platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From 3a91ca2dd191716be9e9f3f20627c1e1a80d13f1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 13:57:23 -0700 Subject: [PATCH 07/25] Update flash attention --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index eeee9ff3..d57a457c 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -41,5 +41,5 @@ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0. 
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.7cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index d08f23ca..b073a3a9 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -41,5 +41,5 @@ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0. 
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.7cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From 64eba9576cb806d2213b7efbb82469aa70a9fd71 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 14:08:40 -0700 Subject: [PATCH 08/25] mtmd: Fix a bug when "include past attachments" is unchecked --- modules/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 7b1629dd..ab6b43c0 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -269,7 +269,7 @@ def generate_chat_prompt(user_input, state, **kwargs): enhanced_user_msg = user_msg # Add attachment content if present AND if past attachments are enabled - if (state.get('include_past_attachments', True) and user_key in metadata and "attachments" in metadata[user_key]): + if user_key in metadata and "attachments" in metadata[user_key]: attachments_text = "" image_refs = "" @@ -277,7 +277,7 @@ def generate_chat_prompt(user_input, state, **kwargs): if attachment.get("type") == "image": # Add image reference for multimodal models image_refs += "<__media__>" - else: + elif state.get('include_past_attachments', True): # Handle text/PDF attachments filename = attachment.get("name", "file") content = attachment.get("content", "") From 58797a9eb5f386cc8262f6d8f1a152494249c28d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 14:17:20 -0700 Subject: [PATCH 09/25] Minor change after 9651b5c873649e5c967142f9f78e7ad6cf59aaf5 --- one_click.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/one_click.py b/one_click.py index 486e893e..67be9e4b 100644 --- a/one_click.py +++ b/one_click.py @@ -135,9 +135,7 @@ def get_pytorch_install_command(gpu_choice): """Get PyTorch installation command based on GPU choice""" base_cmd = f"python -m pip install torch=={TORCH_VERSION} " - if gpu_choice == "NVIDIA": - return base_cmd + "--index-url https://download.pytorch.org/whl/cu124" - elif gpu_choice == "NVIDIA_CUDA128": + if gpu_choice == "NVIDIA_CUDA128": return "python -m pip install torch==2.7.1 --index-url 
https://download.pytorch.org/whl/cu128" elif gpu_choice == "AMD": return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.2.4" @@ -156,9 +154,7 @@ def get_pytorch_update_command(gpu_choice): """Get PyTorch update command based on GPU choice""" base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} " - if gpu_choice == "NVIDIA": - return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" - elif gpu_choice == "NVIDIA_CUDA128": + if gpu_choice == "NVIDIA_CUDA128": return "python -m pip install --upgrade torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128" elif gpu_choice == "AMD": return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4" @@ -181,8 +177,6 @@ def get_requirements_file(gpu_choice): file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt" elif gpu_choice in ["INTEL", "NONE"]: file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt" - elif gpu_choice == "NVIDIA": - file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt" elif gpu_choice == "NVIDIA_CUDA128": file_name = f"requirements_cuda128{'_noavx2' if not cpu_has_avx2() else ''}.txt" else: @@ -330,8 +324,6 @@ def install_webui(): cmd_flags_file.write("\n--cpu\n") # Handle CUDA version display - elif any((is_windows(), is_linux())) and gpu_choice == "NVIDIA": - print("CUDA: 12.4") elif any((is_windows(), is_linux())) and gpu_choice == "NVIDIA_CUDA128": print("CUDA: 12.8") From 35707c2dd89a9983f2038ded9bc67d13aa7bc213 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 21:39:57 -0700 Subject: [PATCH 10/25] Update README --- README.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 5e7f37de..93d31131 100644 --- a/README.md +++ b/README.md @@ -32,13 +32,13 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## How to install -#### Option 1: Portable builds (get started in 1 minute) +#### ✅ Option 1: Portable builds (get started in 1 minute) No installation needed – just download, unzip and run. All dependencies included. Compatible with GGUF (llama.cpp) models on Windows, Linux, and macOS. -Download from here: https://github.com/oobabooga/text-generation-webui/releases +Download from here: **https://github.com/oobabooga/text-generation-webui/releases** #### Option 2: One-click installer @@ -57,23 +57,6 @@ You can pass command-line flags directly (e.g., `./start_linux.sh --help`), or a To update, run the update script for your OS: `update_wizard_windows.bat`, `update_wizard_linux.sh`, or `update_wizard_macos.sh`. -
- -One-click installer details - - -### One-click-installer - -The script uses Miniforge to set up a Conda environment in the `installer_files` folder. - -If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`. - -* There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. -* To install requirements for extensions, it is recommended to use the update wizard script with the "Install/update extensions requirements" option. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. -* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. - -
-
Manual portable installation with venv @@ -108,6 +91,23 @@ deactivate ```
+
+ +One-click installer details + + +### One-click-installer + +The script uses Miniforge to set up a Conda environment in the `installer_files` folder. + +If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`. + +* There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. +* To install requirements for extensions, it is recommended to use the update wizard script with the "Install/update extensions requirements" option. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. +* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. + +
+
Manual full installation with conda or docker From 3dec47eaf8f94ba085e7d4d06522ce398d04bdbe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 21:43:46 -0700 Subject: [PATCH 11/25] Small one-click installer changes --- one_click.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/one_click.py b/one_click.py index 67be9e4b..1ea5e59f 100644 --- a/one_click.py +++ b/one_click.py @@ -16,7 +16,7 @@ import sys # os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030' # Define the required versions -TORCH_VERSION = "2.7.0" +TORCH_VERSION = "2.7.1" PYTHON_VERSION = "3.11" LIBSTDCXX_VERSION_LINUX = "12.1.0" @@ -136,7 +136,7 @@ def get_pytorch_install_command(gpu_choice): base_cmd = f"python -m pip install torch=={TORCH_VERSION} " if gpu_choice == "NVIDIA_CUDA128": - return "python -m pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128" + return base_cmd + "--index-url https://download.pytorch.org/whl/cu128" elif gpu_choice == "AMD": return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.2.4" elif gpu_choice in ["APPLE", "NONE"]: From 320f7339cdd3bf52ff705e0679d55fd738e61218 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 21:56:35 -0700 Subject: [PATCH 12/25] Update README --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 93d31131..3832030c 100644 --- a/README.md +++ b/README.md @@ -139,19 +139,19 @@ conda activate textgen | System | GPU | Command | |--------|---------|---------| -| Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | -| Linux/WSL | CPU only | `pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` | -| MacOS + MPS | Any | `pip3 install torch==2.6.0` | -| Windows | NVIDIA | `pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | -| Windows | CPU only | `pip3 install torch==2.6.0` | +| Linux/WSL | NVIDIA | `pip3 install torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128` | +| Linux/WSL | CPU only | `pip3 install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu` | +| Linux | AMD | `pip3 install torch==2.7.1 --index-url https://download.pytorch.org/whl/rocm6.2.4` | +| MacOS + MPS | Any | `pip3 install torch==2.7.1` | +| Windows | NVIDIA | `pip3 install torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128` | +| Windows | CPU only | `pip3 install torch==2.7.1` | The up-to-date commands can be found here: https://pytorch.org/get-started/locally/. If you need `nvcc` to compile some library manually, you will additionally need to install this: ``` -conda install -y -c "nvidia/label/cuda-12.4.1" cuda +conda install -y -c "nvidia/label/cuda-12.8.1" cuda ``` #### 3. Install the web UI From 6bf31479d92ab9da0c643ca907a90b300a230f25 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 22:00:21 -0700 Subject: [PATCH 13/25] Update README --- README.md | 60 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 3832030c..ccde678a 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,35 @@ Compatible with GGUF (llama.cpp) models on Windows, Linux, and macOS. 
Download from here: **https://github.com/oobabooga/text-generation-webui/releases** -#### Option 2: One-click installer +#### Option 2: Manual portable install with venv + +Very fast setup that should work on any Python 3.9+: + +```bash +# Clone repository +git clone https://github.com/oobabooga/text-generation-webui +cd text-generation-webui + +# Create virtual environment +python -m venv venv + +# Activate virtual environment +# On Windows: +venv\Scripts\activate +# On macOS/Linux: +source venv/bin/activate + +# Install dependencies (choose appropriate file under requirements/portable for your hardware) +pip install -r requirements/portable/requirements.txt --upgrade + +# Launch server (basic command) +python server.py --portable --api --auto-launch + +# When done working, deactivate +deactivate +``` + +#### Option 3: One-click installer For users who need additional backends (ExLlamaV3, Transformers) or extensions (TTS, voice input, translation, etc). Requires ~10GB disk space and downloads PyTorch. @@ -62,36 +90,6 @@ To update, run the update script for your OS: `update_wizard_windows.bat`, `upda Manual portable installation with venv -### Manual portable installation with venv - -Very fast setup that should work on any Python 3.9+: - -```bash -# Clone repository -git clone https://github.com/oobabooga/text-generation-webui -cd text-generation-webui - -# Create virtual environment -python -m venv venv - -# Activate virtual environment -# On Windows: -venv\Scripts\activate -# On macOS/Linux: -source venv/bin/activate - -# Install dependencies (choose appropriate file under requirements/portable for your hardware) -pip install -r requirements/portable/requirements.txt - -# Launch server (basic command) -python server.py --portable --api --auto-launch - -# When done working, deactivate -deactivate -``` -
- -
One-click installer details From 8cdb911a6e637c355dc9eac2ab43f94eab7b3281 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 22:06:12 -0700 Subject: [PATCH 14/25] Update README --- README.md | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index ccde678a..f213f7a9 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,6 @@ A Gradio web UI for Large Language Models. -Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation. - [Try the Deep Reason extension](https://oobabooga.gumroad.com/l/deep_reason) |![Image1](https://github.com/oobabooga/screenshots/raw/main/INSTRUCT-3.5.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/CHAT-3.5.png) | @@ -79,16 +77,11 @@ For users who need additional backends (ExLlamaV3, Transformers) or extensions ( To restart the web UI later, run the same `start_` script. -To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again. - You can pass command-line flags directly (e.g., `./start_linux.sh --help`), or add them to `user_data/CMD_FLAGS.txt` (e.g., `--api` to enable the API). To update, run the update script for your OS: `update_wizard_windows.bat`, `update_wizard_linux.sh`, or `update_wizard_macos.sh`. -
- -Manual portable installation with venv - +To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again. One-click installer details @@ -236,13 +229,13 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [- [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock] - [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N] - [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] - [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] - [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] - [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] - [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] - [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] + [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--mmproj MMPROJ] [--ctx-size N] [--cache-type N] + [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--enable-tp] + [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] [--deepspeed] + [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] + [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] + [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] + [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text generation web UI @@ -299,6 +292,7 @@ llama.cpp: --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. + --mmproj MMPROJ Path to the mmproj file for vision models. 
Context and cache: --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. @@ -312,6 +306,10 @@ Speculative decoding: --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. +ExLlamaV3: + --enable-tp, --enable_tp Enable Tensor Parallelism (TP) to split the model across GPUs. + --tp-backend TP_BACKEND The backend for tensor parallelism. Valid options: native, nccl. Default: native. + ExLlamaV2: --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. @@ -320,7 +318,6 @@ ExLlamaV2: --no_xformers Force xformers to not be used. --no_sdpa Force Torch SDPA to not be used. --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. - --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. From 8a14aa62ff369615de895d155b0a105a3b4f7cb8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 22:06:59 -0700 Subject: [PATCH 15/25] Update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f213f7a9..6e5cb5d8 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ To update, run the update script for your OS: `update_wizard_windows.bat`, `upda To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again. +
One-click installer details From 6b1b2e2373df2a17ac48eaf5c53494aa8f4b8a57 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 17 Aug 2025 22:19:20 -0700 Subject: [PATCH 16/25] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6e5cb5d8..6b49cee0 100644 --- a/README.md +++ b/README.md @@ -378,7 +378,7 @@ text-generation-webui └── llama-2-13b-chat.Q4_K_M.gguf ``` -* The remaining model types (like 16-bit Transformers models and EXL2 models) are made of several files and must be placed in a subfolder. Example: +* The remaining model types (like 16-bit Transformers models and EXL3 models) are made of several files and must be placed in a subfolder. Example: ``` text-generation-webui From 15f99b1b710aced1ce8db70748a2a82602457661 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 18 Aug 2025 05:51:04 -0700 Subject: [PATCH 17/25] Installer: Fix a requirement file --- one_click.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/one_click.py b/one_click.py index 1ea5e59f..2c7c2c0c 100644 --- a/one_click.py +++ b/one_click.py @@ -171,14 +171,14 @@ def get_requirements_file(gpu_choice): """Get requirements file path based on GPU choice""" requirements_base = os.path.join("requirements", "full") - if gpu_choice == "AMD": + if gpu_choice == "NVIDIA_CUDA128": + file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt" + elif gpu_choice == "AMD": file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt" elif gpu_choice == "APPLE": file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt" elif gpu_choice in ["INTEL", "NONE"]: file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt" - elif gpu_choice == "NVIDIA_CUDA128": - file_name = f"requirements_cuda128{'_noavx2' if not cpu_has_avx2() else ''}.txt" else: raise ValueError(f"Unknown GPU choice: {gpu_choice}") From 08594e52636d6a6d583a87ecb6fd10e49821c500 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 18 Aug 2025 05:59:46 -0700 Subject: [PATCH 18/25] Installer: Slight improvement --- one_click.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/one_click.py b/one_click.py index 2c7c2c0c..58c0e8ff 100644 --- a/one_click.py +++ b/one_click.py @@ -408,7 +408,7 @@ def update_requirements(initial_installation=False, pull=True): with open(requirements_file, 'r') as f: after_pull_whl_lines = [line for line in f if '.whl' in line] - wheels_changed = wheels_changed or (before_pull_whl_lines != after_pull_whl_lines) + wheels_changed = wheels_changed or (before_pull_whl_lines != after_pull_whl_lines) # Check for changes to installer files for file in files_to_check: From 7d23a55901a43c323d2afe6d8c4585e7c9c3bca2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 18 Aug 2025 09:05:47 -0700 Subject: [PATCH 19/25] Fix model unloading when switching loaders (closes #7203) --- modules/models.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/models.py b/modules/models.py index 938eed3d..e620957b 100644 --- a/modules/models.py +++ b/modules/models.py @@ -128,10 +128,12 @@ def unload_model(keep_model_name=False): if shared.model is None: return - is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') - if shared.args.loader in ['ExLlamav3_HF', 
'ExLlamav3']: + model_class_name = shared.model.__class__.__name__ + is_llamacpp = (model_class_name == 'LlamaServer') + + if model_class_name in ['Exllamav3Model', 'Exllamav3HF']: shared.model.unload() - elif shared.args.loader in ['ExLlamav2_HF', 'ExLlamav2'] and hasattr(shared.model, 'unload'): + elif model_class_name in ['Exllamav2Model', 'Exllamav2HF'] and hasattr(shared.model, 'unload'): shared.model.unload() shared.model = shared.tokenizer = None From 8805a50d24066dd5645b2dbb85595bc07d75c34c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 18 Aug 2025 15:31:01 -0700 Subject: [PATCH 20/25] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index d57a457c..9f906b26 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 47bcb60a..70e031b8 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 
tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 6958ce37..81556326 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 13361a78..7b9d3650 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and 
platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index e4a30168..0fc9162f 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 3a9a953b..3565a994 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index a3e176d3..64c17416 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index b073a3a9..2b162308 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 0c7f1d29..943ea600 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 09f1c502..394b89b6 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 75296cb4..cffe3aea 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt 
b/requirements/portable/requirements_cpu_only.txt index ff3d7cb1..d274e2c8 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 97414bde..47ec086e 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 7f543205..9a0a3694 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index c1764ead..45e96da9 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 142b67ec..9183562e 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From cbba58bef9f70a366413cce145a907658a24a982 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 18 Aug 2025 15:50:09 -0700 Subject: [PATCH 21/25] UI: Fix code blocks having an extra empty line --- modules/html_generator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/html_generator.py b/modules/html_generator.py index cb14a722..279f9ba6 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -306,6 +306,9 @@ def process_markdown_content(string): # Convert to HTML using markdown html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()]) + # Remove extra newlines before </code> + html_output = re.sub(r'\s*</code>', '</code>', html_output) + # Unescape code blocks pattern = re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL) html_output = pattern.sub(lambda x: html.unescape(x.group()), html_output) From 5b06284a8af7d5bf068210124797fa7e4b31ade4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 19 Aug 2025 06:23:21 -0700 Subject: [PATCH 22/25] UI: Keep ExLlamav3_HF selected if already selected for EXL3 models --- modules/models_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index bf7b1cf9..c325fa0c 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -251,7 +251,7 @@ def apply_model_settings_to_state(model, state): model_settings = get_model_metadata(model) if 'loader' in model_settings: loader = model_settings.pop('loader') - if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): + if not ((loader == 'ExLlamav2_HF' and state['loader'] == 'ExLlamav2') or (loader == 'ExLlamav3_HF' and state['loader'] == 'ExLlamav3')): state['loader'] = loader for k in model_settings: From e0f5905a97bd40a343003b4626e08b3fec9416de Mon Sep 17 00:00:00 2001 From: oobabooga
<112222186+oobabooga@users.noreply.github.com> Date: Tue, 19 Aug 2025 06:34:05 -0700 Subject: [PATCH 23/25] Code formatting --- modules/exllamav3.py | 3 ++- one_click.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 73962977..fd676a00 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -236,11 +236,12 @@ class Exllamav3Model: """ Generate text with streaming using native ExLlamaV3 API """ - image_embeddings = [] if shared.is_multimodal: # Process images and modify prompt (ExLlamaV3-specific) prompt, image_embeddings = self._process_images_for_generation(prompt, state) + else: + image_embeddings = [] # Greedy decoding is a special case if state['temperature'] == 0: diff --git a/one_click.py b/one_click.py index 58c0e8ff..881d7489 100644 --- a/one_click.py +++ b/one_click.py @@ -155,7 +155,7 @@ def get_pytorch_update_command(gpu_choice): base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} " if gpu_choice == "NVIDIA_CUDA128": - return "python -m pip install --upgrade torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128" + return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu128" elif gpu_choice == "AMD": return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4" elif gpu_choice in ["APPLE", "NONE"]: From 1972479610f4b1482912ff012469e8ab9cbaa908 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 19 Aug 2025 06:48:22 -0700 Subject: [PATCH 24/25] Add the TP option to exllamav3_HF --- modules/exllamav3_hf.py | 5 +++++ modules/loaders.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 1254ff5d..d9f4ed57 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -74,6 +74,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] load_params['use_per_device'] = split + # Tensor-parallelism + if shared.args.enable_tp: + load_params['tensor_p'] = True + load_params['tp_backend'] = shared.args.tp_backend + self.ex_model.load(**load_params) self.past_seq = None self.max_tokens = max_tokens diff --git a/modules/loaders.py b/modules/loaders.py index 295db1e7..f88e976d 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -56,6 +56,8 @@ loaders_and_params = OrderedDict({ 'cfg_cache', 'trust_remote_code', 'no_use_fast', + 'enable_tp', + 'tp_backend', ], 'ExLlamav3': [ 'ctx_size', From 9e7b326e3402de37adadc8509764738e98113763 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 19 Aug 2025 06:50:40 -0700 Subject: [PATCH 25/25] Lint --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index e620957b..ca3d184f 100644 --- a/modules/models.py +++ b/modules/models.py @@ -55,7 +55,7 @@ def load_model(model_name, loader=None): if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp': shared.settings['truncation_length'] = shared.args.ctx_size - shared.is_multimodal = False + shared.is_multimodal = False if loader.lower() in ('exllamav3', 'llama.cpp'): shared.is_multimodal = model.is_multimodal()
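Taken together, the README and loader patches above document two new launch-time options: `--mmproj` for llama.cpp vision models and `--enable-tp`/`--tp-backend` for ExLlamaV3 tensor parallelism. A minimal usage sketch follows; the model and mmproj file names are placeholders rather than files referenced by these patches, and the exact invocation may differ depending on your setup:

```
# Hypothetical examples only; substitute your own model files.
# llama.cpp vision model with its multimodal projector file:
python server.py --model some-vision-model.Q4_K_M.gguf --mmproj some-vision-mmproj-f16.gguf

# EXL3 model split across GPUs with the new tensor-parallel flags:
python server.py --model Some-Model-exl3 --loader ExLlamav3_HF --enable-tp --tp-backend native
```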