From b391ac8eb1ba63e449f0ef021db56d6513dce646 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 8 Aug 2025 17:51:24 -0700
Subject: [PATCH 01/79] Fix getting the ctx-size for EXL3/EXL2/Transformers
models
---
modules/models_settings.py | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/modules/models_settings.py b/modules/models_settings.py
index e35e1c04..4e53dc81 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -106,9 +106,16 @@ def get_model_metadata(model):
for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
if k in metadata:
- model_settings['truncation_length'] = metadata[k]
- model_settings['truncation_length_info'] = metadata[k]
- model_settings['ctx_size'] = min(metadata[k], 8192)
+ value = metadata[k]
+ elif k in metadata.get('text_config', {}):
+ value = metadata['text_config'][k]
+ else:
+ continue
+
+ model_settings['truncation_length'] = value
+ model_settings['truncation_length_info'] = value
+ model_settings['ctx_size'] = min(value, 8192)
+ break
if 'rope_theta' in metadata:
model_settings['rope_freq_base'] = metadata['rope_theta']
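Note: the hunk above makes the context-size lookup also check the nested `text_config` block that multimodal configs use. A minimal, standalone sketch of the resulting resolution order (the helper name and sample metadata are illustrative, not from the patch):

```python
def resolve_ctx_size(metadata: dict, cap: int = 8192):
    """Return (truncation_length, ctx_size) using the same fallback order as the patch."""
    for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
        if k in metadata:
            value = metadata[k]
        elif k in metadata.get('text_config', {}):
            value = metadata['text_config'][k]
        else:
            continue
        return value, min(value, cap)
    return None, None

# Example: a vision-language config that only exposes the limit under text_config
print(resolve_ctx_size({'text_config': {'max_position_embeddings': 131072}}))  # (131072, 8192)
```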
From 88127f46c124723554b5e87cad9c868348ed4c53 Mon Sep 17 00:00:00 2001
From: Katehuuh <133996730+Katehuuh@users.noreply.github.com>
Date: Sat, 9 Aug 2025 04:31:16 +0200
Subject: [PATCH 02/79] Add multimodal support (ExLlamaV3) (#7174)
---
css/main.css | 13 ++
docs/12 - OpenAI API.md | 18 ++
extensions/openai/completions.py | 102 +++++++++-
extensions/openai/image_utils.py | 97 ++++++++++
extensions/openai/typing.py | 16 +-
modules/chat.py | 126 ++++++++++---
modules/exllamav3.py | 313 +++++++++++++++++++++++++++++++
modules/html_generator.py | 29 ++-
modules/loaders.py | 40 ++++
modules/models.py | 13 +-
modules/shared.py | 2 +
modules/text_generation.py | 10 +-
modules/ui_chat.py | 2 +-
13 files changed, 726 insertions(+), 55 deletions(-)
create mode 100644 extensions/openai/image_utils.py
create mode 100644 modules/exllamav3.py
diff --git a/css/main.css b/css/main.css
index 240a94d5..de16d81d 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1577,6 +1577,19 @@ strong {
margin-top: 4px;
}
+.image-attachment {
+ flex-direction: column;
+}
+
+.image-preview {
+ border-radius: 16px;
+ margin-bottom: 5px;
+ object-fit: cover;
+ object-position: center;
+ border: 2px solid var(--border-color-primary);
+ aspect-ratio: 1 / 1;
+}
+
button:focus {
outline: none;
}
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index ec999397..b7b5fbc1 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -77,6 +77,24 @@ curl http://127.0.0.1:5000/v1/chat/completions \
}'
```
+#### Multimodal support (ExLlamaV3)
+
+```shell
+curl http://127.0.0.1:5000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What color is this image?"},
+ {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}}
+ ]
+ }
+ ]
+ }'
+```
+
#### SSE streaming
```shell
diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 5181b18b..3d389f0b 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -7,6 +7,7 @@ import tiktoken
from pydantic import ValidationError
from extensions.openai.errors import InvalidRequestError
+from extensions.openai.image_utils import convert_openai_messages_to_images
from extensions.openai.typing import ToolDefinition
from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
from modules import shared
@@ -16,6 +17,7 @@ from modules.chat import (
load_character_memoized,
load_instruction_template_memoized
)
+from modules.logging_colors import logger
from modules.presets import load_preset_memoized
from modules.text_generation import decode, encode, generate_reply
@@ -82,6 +84,21 @@ def process_parameters(body, is_legacy=False):
return generate_params
+def process_multimodal_content(content):
+ """Extract text from OpenAI multimodal format for non-multimodal models"""
+ if isinstance(content, str):
+ return content
+
+ if isinstance(content, list):
+ text_parts = []
+ for item in content:
+ if isinstance(item, dict) and item.get('type') == 'text':
+ text_parts.append(item.get('text', ''))
+ return ' '.join(text_parts) if text_parts else str(content)
+
+ return str(content)
+
+
def convert_history(history):
'''
Chat histories in this program are in the format [message, reply].
@@ -99,8 +116,11 @@ def convert_history(history):
role = entry["role"]
if role == "user":
+ # Extract text content (images handled by model-specific code)
+ content = process_multimodal_content(content)
user_input = content
user_input_last = True
+
if current_message:
chat_dialogue.append([current_message, '', ''])
current_message = ""
@@ -126,7 +146,11 @@ def convert_history(history):
if not user_input_last:
user_input = ""
- return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue)}
+ return user_input, system_message, {
+ 'internal': chat_dialogue,
+ 'visible': copy.deepcopy(chat_dialogue),
+ 'messages': history # Store original messages for multimodal models
+ }
def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict:
@@ -150,9 +174,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
elif m['role'] == 'function':
raise InvalidRequestError(message="role: function is not supported.", param='messages')
- if 'content' not in m and "image_url" not in m:
+ # Handle multimodal content validation
+ content = m.get('content')
+ if content is None:
raise InvalidRequestError(message="messages: missing content", param='messages')
+ # Validate multimodal content structure
+ if isinstance(content, list):
+ for item in content:
+ if not isinstance(item, dict) or 'type' not in item:
+ raise InvalidRequestError(message="messages: invalid content item format", param='messages')
+ if item['type'] not in ['text', 'image_url']:
+ raise InvalidRequestError(message="messages: unsupported content type", param='messages')
+ if item['type'] == 'text' and 'text' not in item:
+ raise InvalidRequestError(message="messages: missing text in content item", param='messages')
+ if item['type'] == 'image_url' and ('image_url' not in item or 'url' not in item['image_url']):
+ raise InvalidRequestError(message="messages: missing image_url in content item", param='messages')
+
# Chat Completions
object_type = 'chat.completion' if not stream else 'chat.completion.chunk'
created_time = int(time.time())
@@ -336,9 +374,26 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
prompt_str = 'context' if is_legacy else 'prompt'
- # ... encoded as a string, array of strings, array of tokens, or array of token arrays.
- if prompt_str not in body:
- raise InvalidRequestError("Missing required input", param=prompt_str)
+ # Handle both prompt and messages format for unified multimodal support
+ if prompt_str not in body or body[prompt_str] is None:
+ if 'messages' in body:
+ # Convert messages format to prompt for completions endpoint
+ prompt_text = ""
+ for message in body.get('messages', []):
+ if isinstance(message, dict) and 'content' in message:
+ # Extract text content from multimodal messages
+ content = message['content']
+ if isinstance(content, str):
+ prompt_text += content
+ elif isinstance(content, list):
+ for item in content:
+ if isinstance(item, dict) and item.get('type') == 'text':
+ prompt_text += item.get('text', '')
+
+ # Allow empty prompts for image-only requests
+ body[prompt_str] = prompt_text
+ else:
+ raise InvalidRequestError("Missing required input", param=prompt_str)
# common params
generate_params = process_parameters(body, is_legacy=is_legacy)
@@ -349,9 +404,18 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
suffix = body['suffix'] if body['suffix'] else ''
echo = body['echo']
+ # Add messages to generate_params if present for multimodal processing
+ if 'messages' in body:
+ generate_params['messages'] = body['messages']
+
if not stream:
prompt_arg = body[prompt_str]
- if isinstance(prompt_arg, str) or (isinstance(prompt_arg, list) and isinstance(prompt_arg[0], int)):
+
+ # Handle empty/None prompts (e.g., image-only requests)
+ if prompt_arg is None:
+ prompt_arg = ""
+
+ if isinstance(prompt_arg, str) or (isinstance(prompt_arg, list) and len(prompt_arg) > 0 and isinstance(prompt_arg[0], int)):
prompt_arg = [prompt_arg]
resp_list_data = []
@@ -374,7 +438,19 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
- generator = generate_reply(prompt, generate_params, is_chat=False)
+
+ # Use multimodal generation if images are present
+ if 'messages' in generate_params:
+ raw_images = convert_openai_messages_to_images(generate_params['messages'])
+ if raw_images:
+ logger.info(f"Using multimodal generation for {len(raw_images)} images")
+ generate_params['raw_images'] = raw_images
+ generator = shared.model.generate_with_streaming(prompt, generate_params)
+ else:
+ generator = generate_reply(prompt, generate_params, is_chat=False)
+ else:
+ generator = generate_reply(prompt, generate_params, is_chat=False)
+
answer = ''
for a in generator:
@@ -447,7 +523,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
- generator = generate_reply(prompt, generate_params, is_chat=False)
+ # Use multimodal generation if images are present
+ if 'messages' in generate_params:
+ raw_images = convert_openai_messages_to_images(generate_params['messages'])
+ if raw_images:
+ logger.info(f"Using multimodal generation for {len(raw_images)} images")
+ generate_params['raw_images'] = raw_images
+ generator = shared.model.generate_with_streaming(prompt, generate_params)
+ else:
+ generator = generate_reply(prompt, generate_params, is_chat=False)
+ else:
+ generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
seen_content = ''
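Note: the docs hunk earlier shows the curl form; from Python the same multimodal request can be issued as below. This is a sketch assuming a local server on port 5000; the image path is illustrative, and the base64 data URL is accepted by the new validation in `chat_completions_common` alongside plain `http(s)` URLs:

```python
import base64

import requests

with open("cat.png", "rb") as f:  # any local image; path is illustrative
    data_url = "data:image/png;base64," + base64.b64encode(f.read()).decode()

resp = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": data_url}},
            ],
        }]
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])
```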
diff --git a/extensions/openai/image_utils.py b/extensions/openai/image_utils.py
new file mode 100644
index 00000000..c54f0532
--- /dev/null
+++ b/extensions/openai/image_utils.py
@@ -0,0 +1,97 @@
+"""
+Shared image processing utilities for multimodal support.
+Used by both ExLlamaV3 and llama.cpp implementations.
+"""
+import base64
+import io
+from typing import Any, List, Tuple
+
+from PIL import Image
+
+from modules.logging_colors import logger
+
+
+def decode_base64_image(base64_string: str) -> Image.Image:
+ """Decodes a base64 string to a PIL Image."""
+ try:
+ if base64_string.startswith('data:image/'):
+ base64_string = base64_string.split(',', 1)[1]
+
+ image_data = base64.b64decode(base64_string)
+ image = Image.open(io.BytesIO(image_data))
+ return image
+ except Exception as e:
+ logger.error(f"Failed to decode base64 image: {e}")
+ raise ValueError(f"Invalid base64 image data: {e}")
+
+
+def process_message_content(content: Any) -> Tuple[str, List[Image.Image]]:
+ """
+ Processes message content that may contain text and images.
+ Returns: A tuple of (text_content, list_of_pil_images).
+ """
+ if isinstance(content, str):
+ return content, []
+
+ if isinstance(content, list):
+ text_parts = []
+ images = []
+ for item in content:
+ if not isinstance(item, dict):
+ continue
+
+ item_type = item.get('type', '')
+ if item_type == 'text':
+ text_parts.append(item.get('text', ''))
+ elif item_type == 'image_url':
+ image_url_data = item.get('image_url', {})
+ image_url = image_url_data.get('url', '')
+
+ if image_url.startswith('data:image/'):
+ try:
+ images.append(decode_base64_image(image_url))
+ except Exception as e:
+ logger.warning(f"Failed to process a base64 image: {e}")
+ elif image_url.startswith('http'):
+ # Support external URLs
+ try:
+ import requests
+ response = requests.get(image_url, timeout=10)
+ response.raise_for_status()
+ image_data = response.content
+ image = Image.open(io.BytesIO(image_data))
+ images.append(image)
+ logger.info("Successfully loaded external image from URL")
+ except Exception as e:
+ logger.warning(f"Failed to fetch external image: {e}")
+ else:
+ logger.warning(f"Unsupported image URL format: {image_url[:70]}...")
+
+ return ' '.join(text_parts), images
+
+ return str(content), []
+
+
+def convert_image_attachments_to_pil(image_attachments: List[dict]) -> List[Image.Image]:
+ """Convert webui image_attachments format to PIL Images."""
+ pil_images = []
+ for attachment in image_attachments:
+ if attachment.get('type') == 'image' and 'image_data' in attachment:
+ try:
+ image = decode_base64_image(attachment['image_data'])
+ if image.mode != 'RGB':
+ image = image.convert('RGB')
+ pil_images.append(image)
+ except Exception as e:
+ logger.warning(f"Failed to process image attachment: {e}")
+ return pil_images
+
+
+def convert_openai_messages_to_images(messages: List[dict]) -> List[Image.Image]:
+ """Convert OpenAI messages format to PIL Images."""
+ all_images = []
+ for message in messages:
+ if isinstance(message, dict) and 'content' in message:
+ _, images = process_message_content(message['content'])
+ all_images.extend(images)
+ return all_images
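Note: a short usage sketch for the new helpers in `image_utils.py`; the message payload is invented, but the call pattern matches how `completions.py` and the ExLlamaV3 loader consume them:

```python
from extensions.openai.image_utils import (
    convert_openai_messages_to_images,
    process_message_content,
)

messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this picture."},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo..."}},
    ],
}]

# Split one message into its text and any decodable images
text, images = process_message_content(messages[0]["content"])
print(text)         # "Describe this picture."
print(len(images))  # 0 here, because the sample base64 payload is truncated; a valid payload yields PIL images

# Collect every image across a whole conversation (what completions.py does)
pil_images = convert_openai_messages_to_images(messages)
```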
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 6bd3749f..e9f92da5 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -2,7 +2,7 @@ import json
import time
from typing import Dict, List, Optional
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, field_validator, validator
class GenerationOptions(BaseModel):
@@ -99,7 +99,8 @@ class ToolCall(BaseModel):
class CompletionRequestParams(BaseModel):
model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
- prompt: str | List[str]
+ prompt: str | List[str] | None = Field(default=None, description="Text prompt for completion. Can also use 'messages' format for multimodal.")
+ messages: List[dict] | None = Field(default=None, description="OpenAI messages format for multimodal support. Alternative to 'prompt'.")
best_of: int | None = Field(default=1, description="Unused parameter.")
echo: bool | None = False
frequency_penalty: float | None = 0
@@ -115,6 +116,17 @@ class CompletionRequestParams(BaseModel):
top_p: float | None = 1
user: str | None = Field(default=None, description="Unused parameter.")
+ @field_validator('prompt', 'messages')
+ @classmethod
+ def validate_prompt_or_messages(cls, v, info):
+ """Ensure either 'prompt' or 'messages' is provided for completions."""
+ if info.field_name == 'prompt': # If we're validating 'prompt', check if neither prompt nor messages will be set
+ messages = info.data.get('messages')
+ if v is None and messages is None:
+ raise ValueError("Either 'prompt' or 'messages' must be provided")
+
+ return v
+
class CompletionRequest(GenerationOptions, CompletionRequestParams):
pass
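Note: the `field_validator` above only sees previously validated fields through `info.data`, which is why it special-cases `info.field_name == 'prompt'`. Cross-field checks like this are often written with `model_validator(mode='after')`, which runs once with every field populated; a standalone pydantic v2 sketch of that pattern (not part of the patch):

```python
from typing import List, Optional

from pydantic import BaseModel, model_validator


class CompletionRequestSketch(BaseModel):
    prompt: Optional[str] = None
    messages: Optional[List[dict]] = None

    @model_validator(mode="after")
    def check_prompt_or_messages(self):
        # Runs after all fields are parsed, so both values are visible here.
        if self.prompt is None and self.messages is None:
            raise ValueError("Either 'prompt' or 'messages' must be provided")
        return self


CompletionRequestSketch(messages=[{"role": "user", "content": "hi"}])  # ok
# CompletionRequestSketch()  # would raise ValueError
```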
diff --git a/modules/chat.py b/modules/chat.py
index 1ab91b5e..354ae46b 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -271,16 +271,27 @@ def generate_chat_prompt(user_input, state, **kwargs):
# Add attachment content if present AND if past attachments are enabled
if (state.get('include_past_attachments', True) and user_key in metadata and "attachments" in metadata[user_key]):
attachments_text = ""
- for attachment in metadata[user_key]["attachments"]:
- filename = attachment.get("name", "file")
- content = attachment.get("content", "")
- if attachment.get("type") == "text/html" and attachment.get("url"):
- attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
- else:
- attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+ image_refs = ""
- if attachments_text:
- enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}"
+ for attachment in metadata[user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ # Add image reference for multimodal models
+ image_refs += "<__media__>"
+ else:
+ # Handle text/PDF attachments
+ filename = attachment.get("name", "file")
+ content = attachment.get("content", "")
+ if attachment.get("type") == "text/html" and attachment.get("url"):
+ attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
+ else:
+ attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+
+ if image_refs or attachments_text:
+ enhanced_user_msg = user_msg
+ if image_refs:
+ enhanced_user_msg += f" {image_refs}"
+ if attachments_text:
+ enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})
@@ -301,16 +312,23 @@ def generate_chat_prompt(user_input, state, **kwargs):
if user_key in metadata and "attachments" in metadata[user_key]:
attachments_text = ""
- for attachment in metadata[user_key]["attachments"]:
- filename = attachment.get("name", "file")
- content = attachment.get("content", "")
- if attachment.get("type") == "text/html" and attachment.get("url"):
- attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
- else:
- attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+ image_refs = ""
- if attachments_text:
- user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}"
+ for attachment in metadata[user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ image_refs += "<__media__>"
+ else:
+ filename = attachment.get("name", "file")
+ content = attachment.get("content", "")
+ if attachment.get("type") == "text/html" and attachment.get("url"):
+ attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
+ else:
+ attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+
+ if image_refs or attachments_text:
+ user_input = f"{user_input} {image_refs}"
+ if attachments_text:
+ user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
messages.append({"role": "user", "content": user_input})
@@ -594,29 +612,64 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
file_extension = path.suffix.lower()
try:
- # Handle different file types
- if file_extension == '.pdf':
+ # Handle image files
+ if file_extension in ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif']:
+ # Convert image to base64
+ with open(path, 'rb') as f:
+ image_data = base64.b64encode(f.read()).decode('utf-8')
+
+ # Determine MIME type from extension
+ mime_type_map = {
+ '.jpg': 'image/jpeg',
+ '.jpeg': 'image/jpeg',
+ '.png': 'image/png',
+ '.webp': 'image/webp',
+ '.bmp': 'image/bmp',
+ '.gif': 'image/gif'
+ }
+ mime_type = mime_type_map.get(file_extension, 'image/jpeg')
+
+ # Format as data URL
+ data_url = f"data:{mime_type};base64,{image_data}"
+
+ # Generate unique image ID
+ image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1
+
+ attachment = {
+ "name": filename,
+ "type": "image",
+ "image_data": data_url,
+ "image_id": image_id,
+ "file_path": str(path) # For UI preview
+ }
+ elif file_extension == '.pdf':
# Process PDF file
content = extract_pdf_text(path)
- file_type = "application/pdf"
+ attachment = {
+ "name": filename,
+ "type": "application/pdf",
+ "content": content,
+ }
elif file_extension == '.docx':
content = extract_docx_text(path)
- file_type = "application/docx"
+ attachment = {
+ "name": filename,
+ "type": "application/docx",
+ "content": content,
+ }
else:
# Default handling for text files
with open(path, 'r', encoding='utf-8') as f:
content = f.read()
- file_type = "text/plain"
- # Add attachment
- attachment = {
- "name": filename,
- "type": file_type,
- "content": content,
- }
+ attachment = {
+ "name": filename,
+ "type": "text/plain",
+ "content": content,
+ }
history['metadata'][key]["attachments"].append(attachment)
- return content # Return the content for reuse
+ return attachment # Return the attachment for reuse
except Exception as e:
logger.error(f"Error processing attachment {filename}: {e}")
return None
@@ -759,6 +812,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
for file_path in files:
add_message_attachment(output, row_idx, file_path, is_user=True)
+ # Collect image attachments for ExLlamaV3
+ image_attachments = []
+ if 'metadata' in output:
+ user_key = f"user_{row_idx}"
+ if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
+ for attachment in output['metadata'][user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ image_attachments.append(attachment)
+
+ # Add image attachments to state for the generation
+ if image_attachments:
+ state['image_attachments'] = image_attachments
+
# Add web search results as attachments if enabled
if state.get('enable_web_search', False):
search_query = generate_search_query(text, state)
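Note: the image branch added to `add_message_attachment` reduces to "read file, map extension to MIME type, wrap as data URL". A trimmed sketch of that transformation (the helper name is illustrative):

```python
import base64
from pathlib import Path

MIME_BY_EXT = {
    '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
    '.webp': 'image/webp', '.bmp': 'image/bmp', '.gif': 'image/gif',
}


def image_file_to_data_url(file_path: str) -> str:
    """Encode an image file as a data URL, matching the attachment format in the patch."""
    path = Path(file_path)
    mime_type = MIME_BY_EXT.get(path.suffix.lower(), 'image/jpeg')
    encoded = base64.b64encode(path.read_bytes()).decode('utf-8')
    return f"data:{mime_type};base64,{encoded}"


# attachment = {"name": "cat.png", "type": "image", "image_data": image_file_to_data_url("cat.png"), "image_id": 1}
```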
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
new file mode 100644
index 00000000..c2532ec3
--- /dev/null
+++ b/modules/exllamav3.py
@@ -0,0 +1,313 @@
+import traceback
+from pathlib import Path
+from typing import Any, List, Tuple
+
+from exllamav3 import Cache, Config, Generator, Model, Tokenizer
+from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
+
+from extensions.openai.image_utils import (
+ convert_image_attachments_to_pil,
+ convert_openai_messages_to_images
+)
+from modules import shared
+from modules.logging_colors import logger
+
+try:
+ import flash_attn
+except Exception:
+ logger.warning('Failed to load flash-attention due to the following error:\n')
+ traceback.print_exc()
+
+
+class Exllamav3Model:
+ def __init__(self):
+ pass
+
+ @classmethod
+ def from_pretrained(cls, path_to_model):
+ path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
+
+ # Reset global MMTokenAllocator to prevent token ID corruption when switching models
+ from exllamav3.tokenizer.mm_embedding import (
+ FIRST_MM_EMBEDDING_INDEX,
+ global_allocator
+ )
+ global_allocator.next_token_index = FIRST_MM_EMBEDDING_INDEX
+ logger.info("Reset MMTokenAllocator for clean multimodal token allocation")
+
+ config = Config.from_directory(str(path_to_model))
+ model = Model.from_config(config)
+
+ # Calculate the closest multiple of 256 at or above the chosen value
+ max_tokens = shared.args.ctx_size
+ if max_tokens % 256 != 0:
+ adjusted_tokens = ((max_tokens // 256) + 1) * 256
+ logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}")
+ max_tokens = adjusted_tokens
+
+ # Parse cache type (ExLlamaV2 pattern)
+ cache_type = shared.args.cache_type.lower()
+ cache_kwargs = {}
+ if cache_type == 'fp16':
+ layer_type = CacheLayer_fp16
+ elif cache_type.startswith('q'):
+ layer_type = CacheLayer_quant
+ if '_' in cache_type:
+ # Different bits for k and v (e.g., q4_q8)
+ k_part, v_part = cache_type.split('_')
+ k_bits = int(k_part[1:])
+ v_bits = int(v_part[1:])
+ else:
+ # Same bits for k and v (e.g., q4)
+ k_bits = v_bits = int(cache_type[1:])
+
+ # Validate bit ranges
+ if not (2 <= k_bits <= 8 and 2 <= v_bits <= 8):
+ logger.warning(f"Invalid quantization bits: k_bits={k_bits}, v_bits={v_bits}. Must be between 2 and 8. Falling back to fp16.")
+ layer_type = CacheLayer_fp16
+ else:
+ cache_kwargs = {'k_bits': k_bits, 'v_bits': v_bits}
+ else:
+ logger.warning(f"Unrecognized cache type: {cache_type}. Falling back to fp16.")
+ layer_type = CacheLayer_fp16
+
+ cache = Cache(model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs)
+
+ load_params = {'progressbar': True}
+ if shared.args.gpu_split:
+ split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
+ load_params['use_per_device'] = split
+
+ model.load(**load_params)
+
+ tokenizer = Tokenizer.from_config(config)
+
+ # Load vision model component (ExLlamaV3 native)
+ vision_model = None
+ try:
+ logger.info("Loading vision model component...")
+ vision_model = Model.from_config(config, component="vision")
+ vision_model.load(progressbar=True)
+ logger.info("Vision model loaded successfully")
+ except Exception as e:
+ logger.warning(f"Vision model loading failed (multimodal disabled): {e}")
+
+ generator = Generator(
+ model=model,
+ cache=cache,
+ tokenizer=tokenizer,
+ )
+
+ result = cls()
+ result.model = model
+ result.cache = cache
+ result.tokenizer = tokenizer
+ result.generator = generator
+ result.config = config
+ result.max_tokens = max_tokens
+ result.vision_model = vision_model
+
+ return result
+
+ def is_multimodal(self) -> bool:
+ """Check if this model supports multimodal input."""
+ return hasattr(self, 'vision_model') and self.vision_model is not None
+
+ def _process_images_for_generation(self, prompt: str, state: dict) -> Tuple[str, List[Any]]:
+ """
+ Process all possible image inputs and return modified prompt + embeddings.
+ Returns: (processed_prompt, image_embeddings)
+ """
+ if not self.is_multimodal():
+ return prompt, []
+
+ # Collect images from various sources using shared utilities
+ pil_images = []
+
+ # From webui image_attachments (preferred format)
+ if 'image_attachments' in state and state['image_attachments']:
+ pil_images.extend(convert_image_attachments_to_pil(state['image_attachments']))
+
+ # From OpenAI API raw_images
+ elif 'raw_images' in state and state['raw_images']:
+ pil_images.extend(state['raw_images'])
+
+ # From OpenAI API messages format
+ elif 'messages' in state and state['messages']:
+ pil_images.extend(convert_openai_messages_to_images(state['messages']))
+
+ if not pil_images:
+ return prompt, []
+
+ # ExLlamaV3-specific: Generate embeddings
+ try:
+ # Use pre-computed embeddings if available (proper MMEmbedding lifetime)
+ if 'image_embeddings' in state and state['image_embeddings']:
+ # Use existing embeddings - this preserves MMEmbedding lifetime
+ image_embeddings = state['image_embeddings']
+ else:
+ # Do not reset the cache/allocator index; it causes token ID conflicts during generation.
+
+ logger.info(f"Processing {len(pil_images)} image(s) with ExLlamaV3 vision model")
+ image_embeddings = [
+ self.vision_model.get_image_embeddings(tokenizer=self.tokenizer, image=img)
+ for img in pil_images
+ ]
+
+ # ExLlamaV3-specific: Handle prompt processing with placeholders
+ placeholders = [ie.text_alias for ie in image_embeddings]
+
+ if '<__media__>' in prompt:
+ # Web chat: Replace <__media__> placeholders
+ for alias in placeholders:
+ prompt = prompt.replace('<__media__>', alias, 1)
+ logger.info(f"Replaced {len(placeholders)} <__media__> placeholder(s)")
+ else:
+ # API: Prepend embedding aliases
+ combined_placeholders = "\n".join(placeholders)
+ prompt = combined_placeholders + "\n" + prompt
+ logger.info(f"Prepended {len(placeholders)} embedding(s) to prompt")
+
+ return prompt, image_embeddings
+
+ except Exception as e:
+ logger.error(f"Failed to process images: {e}")
+ return prompt, []
+
+ def generate_with_streaming(self, prompt, state):
+ """
+ Generate text with streaming using native ExLlamaV3 API
+ """
+ from exllamav3 import Job
+ from exllamav3.generator.sampler.presets import ComboSampler
+
+ # Process images and modify prompt (ExLlamaV3-specific)
+ prompt, image_embeddings = self._process_images_for_generation(prompt, state)
+
+ sampler = ComboSampler(
+ rep_p=state.get('repetition_penalty', 1.0),
+ freq_p=state.get('frequency_penalty', 0.0),
+ pres_p=state.get('presence_penalty', 0.0),
+ temperature=state.get('temperature', 0.7),
+ min_p=state.get('min_p', 0.0),
+ top_k=state.get('top_k', 0),
+ top_p=state.get('top_p', 1.0),
+ )
+
+ # Encode prompt with embeddings (ExLlamaV3-specific)
+ if image_embeddings:
+ input_ids = self.tokenizer.encode(
+ prompt,
+ encode_special_tokens=True,
+ embeddings=image_embeddings,
+ )
+ else:
+ input_ids = self.tokenizer.encode(prompt, encode_special_tokens=True)
+
+ # Get stop conditions from state (webui format) - keep as strings like ExLlamaV3 examples
+ stop_conditions = []
+ if 'stopping_strings' in state and state['stopping_strings']:
+ # Use strings directly (ExLlamaV3 handles the conversion internally)
+ stop_conditions.extend(state['stopping_strings'])
+
+ # Add EOS token ID as ExLlamaV3 examples do
+ if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None:
+ stop_conditions.append(self.tokenizer.eos_token_id)
+
+ job = Job(
+ input_ids=input_ids,
+ max_new_tokens=state.get('max_new_tokens', 500),
+ decode_special_tokens=True,
+ embeddings=image_embeddings if image_embeddings else None,
+ sampler=sampler,
+ stop_conditions=stop_conditions if stop_conditions else None,
+ )
+
+ # Stream generation
+ self.generator.enqueue(job)
+
+ response_text = ""
+ try:
+ while self.generator.num_remaining_jobs():
+ results = self.generator.iterate()
+ for result in results:
+ if "eos" in result and result["eos"]:
+ break
+
+ chunk = result.get("text", "")
+ if chunk:
+ response_text += chunk
+ yield response_text
+ finally:
+ # No cleanup needed. MMEmbedding lifetime is managed by Python.
+ # Cache and page table resets are unnecessary and can cause token ID conflicts.
+ pass
+
+ def generate(self, prompt, state):
+ """
+ Generate text using native ExLlamaV3 API (non-streaming)
+ """
+ output = self.generator.generate(
+ prompt=prompt,
+ max_new_tokens=state.get('max_new_tokens', 500),
+ temperature=state.get('temperature', 0.7),
+ top_p=state.get('top_p', 1.0),
+ top_k=state.get('top_k', 0),
+ repetition_penalty=state.get('repetition_penalty', 1.0),
+ frequency_penalty=state.get('frequency_penalty', 0.0),
+ presence_penalty=state.get('presence_penalty', 0.0),
+ min_p=state.get('min_p', 0.0),
+ )
+
+ return output
+
+ def encode(self, string, **kwargs):
+ return self.tokenizer.encode(string, **kwargs)
+
+ def decode(self, ids, **kwargs):
+ return self.tokenizer.decode(ids, **kwargs)
+
+ @property
+ def last_prompt_token_count(self):
+ # This would need to be tracked during generation
+ return 0
+
+ def unload(self):
+ logger.info("Unloading ExLlamaV3 model components...")
+
+ if hasattr(self, 'vision_model') and self.vision_model is not None:
+ try:
+ del self.vision_model
+ except Exception as e:
+ logger.warning(f"Error unloading vision model: {e}")
+ self.vision_model = None
+
+ if hasattr(self, 'model') and self.model is not None:
+ try:
+ self.model.unload()
+ del self.model
+ except Exception as e:
+ logger.warning(f"Error unloading main model: {e}")
+ self.model = None
+
+ if hasattr(self, 'cache') and self.cache is not None:
+ self.cache = None
+
+ if hasattr(self, 'generator') and self.generator is not None:
+ self.generator = None
+
+ if hasattr(self, 'tokenizer') and self.tokenizer is not None:
+ self.tokenizer = None
+
+ # Force GPU memory cleanup
+ import gc
+
+ import torch
+ gc.collect()
+
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+ torch.cuda.empty_cache()
+
+ logger.info("ExLlamaV3 model fully unloaded")
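Note: the `cache_type` handling above accepts `fp16`, `q<bits>`, or `q<kbits>_q<vbits>`. The string parsing in isolation, without the ExLlamaV3 cache classes (the function name is illustrative):

```python
def parse_cache_type(cache_type: str):
    """Return ('fp16', {}) or ('quant', {'k_bits': ..., 'v_bits': ...}) following the patch's rules."""
    cache_type = cache_type.lower()
    if cache_type == 'fp16':
        return 'fp16', {}
    if cache_type.startswith('q'):
        if '_' in cache_type:                      # e.g. "q4_q8": different bits for K and V
            k_part, v_part = cache_type.split('_')
            k_bits, v_bits = int(k_part[1:]), int(v_part[1:])
        else:                                      # e.g. "q4": same bits for K and V
            k_bits = v_bits = int(cache_type[1:])
        if 2 <= k_bits <= 8 and 2 <= v_bits <= 8:
            return 'quant', {'k_bits': k_bits, 'v_bits': v_bits}
    return 'fp16', {}                              # unrecognized or out-of-range -> fall back


print(parse_cache_type('q4_q8'))   # ('quant', {'k_bits': 4, 'v_bits': 8})
print(parse_cache_type('q16'))     # ('fp16', {})
```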
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 79237f7f..63a0cdd0 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -406,16 +406,27 @@ def format_message_attachments(history, role, index):
         for attachment in attachments:
             name = html.escape(attachment["name"])
-            # Make clickable if URL exists
-            if "url" in attachment:
-                name = f'<a href="{attachment["url"]}" target="_blank" rel="noopener noreferrer">{name}</a>'
+            if attachment.get("type") == "image":
+                # Show image preview
+                file_path = attachment.get("file_path", "")
+                attachments_html += (
+                    f'<div class="attachment-box image-attachment">'
+                    f'<img src="file/{file_path}" alt="{name}" class="image-preview">'
+                    f'<div class="attachment-name">{name}</div>'
+                    f'</div>'
+                )
+            else:
+                # Make clickable if URL exists (web search)
+                if "url" in attachment:
+                    name = f'<a href="{attachment["url"]}" target="_blank" rel="noopener noreferrer">{name}</a>'
+
+                attachments_html += (
+                    f'<div class="attachment-box">'
+                    f'<div class="attachment-icon">{attachment_svg}</div>'
+                    f'<div class="attachment-name">{name}</div>'
+                    f'</div>'
+                )
-            attachments_html += (
-                f'<div class="attachment-box">'
-                f'<div class="attachment-icon">{attachment_svg}</div>'
-                f'<div class="attachment-name">{name}</div>'
-                f'</div>'
-            )
         attachments_html += '</div>'
         return attachments_html
diff --git a/modules/loaders.py b/modules/loaders.py
index 7546bc5b..e9437c16 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -55,6 +55,11 @@ loaders_and_params = OrderedDict({
'trust_remote_code',
'no_use_fast',
],
+ 'ExLlamav3': [
+ 'ctx_size',
+ 'cache_type',
+ 'gpu_split',
+ ],
'ExLlamav2_HF': [
'ctx_size',
'cache_type',
@@ -248,6 +253,41 @@ loaders_samplers = {
'grammar_string',
'grammar_file_row',
},
+ 'ExLlamav3': {
+ 'temperature',
+ 'dynatemp_low',
+ 'dynatemp_high',
+ 'dynatemp_exponent',
+ 'smoothing_factor',
+ 'min_p',
+ 'top_p',
+ 'top_k',
+ 'typical_p',
+ 'xtc_threshold',
+ 'xtc_probability',
+ 'tfs',
+ 'top_a',
+ 'dry_multiplier',
+ 'dry_allowed_length',
+ 'dry_base',
+ 'repetition_penalty',
+ 'frequency_penalty',
+ 'presence_penalty',
+ 'repetition_penalty_range',
+ 'mirostat_mode',
+ 'mirostat_tau',
+ 'mirostat_eta',
+ 'dynamic_temperature',
+ 'temperature_last',
+ 'auto_max_new_tokens',
+ 'ban_eos_token',
+ 'add_bos_token',
+ 'enable_thinking',
+ 'skip_special_tokens',
+ 'seed',
+ 'custom_token_bans',
+ 'dry_sequence_breakers',
+ },
'ExLlamav2': {
'temperature',
'dynatemp_low',
diff --git a/modules/models.py b/modules/models.py
index c1e7fb56..cc500a40 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -19,6 +19,7 @@ def load_model(model_name, loader=None):
'llama.cpp': llama_cpp_server_loader,
'Transformers': transformers_loader,
'ExLlamav3_HF': ExLlamav3_HF_loader,
+ 'ExLlamav3': ExLlamav3_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'TensorRT-LLM': TensorRT_LLM_loader,
@@ -88,6 +89,14 @@ def ExLlamav3_HF_loader(model_name):
return Exllamav3HF.from_pretrained(model_name)
+def ExLlamav3_loader(model_name):
+ from modules.exllamav3 import Exllamav3Model
+
+ model = Exllamav3Model.from_pretrained(model_name)
+ tokenizer = model.tokenizer
+ return model, tokenizer
+
+
def ExLlamav2_HF_loader(model_name):
from modules.exllamav2_hf import Exllamav2HF
@@ -116,7 +125,9 @@ def unload_model(keep_model_name=False):
return
is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
- if shared.model.__class__.__name__ == 'Exllamav3HF':
+ if shared.args.loader in ['ExLlamav3_HF', 'ExLlamav3']:
+ shared.model.unload()
+ elif shared.args.loader in ['ExLlamav2_HF', 'ExLlamav2'] and hasattr(shared.model, 'unload'):
shared.model.unload()
shared.model = shared.tokenizer = None
diff --git a/modules/shared.py b/modules/shared.py
index ab5198d1..1de4306b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -318,6 +318,8 @@ def fix_loader_name(name):
return 'ExLlamav2_HF'
elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']:
return 'ExLlamav3_HF'
+ elif name in ['exllamav3']:
+ return 'ExLlamav3'
elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
return 'TensorRT-LLM'
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 8d1950b9..d6a87ce8 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -40,7 +40,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return
- if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']:
+ if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
@@ -128,9 +128,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
from modules.torch_utils import get_device
- if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel']:
+ if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']:
input_ids = shared.tokenizer.encode(str(prompt))
- if shared.model.__class__.__name__ != 'Exllamav2Model':
+ if shared.model.__class__.__name__ not in ['Exllamav2Model', 'Exllamav3Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
@@ -148,7 +148,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
- if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
+ if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu:
return input_ids
else:
device = get_device()
@@ -295,6 +295,8 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
_StopEverythingStoppingCriteria
)
+ # Native ExLlamav3Model handles multimodal internally - no special routing needed
+
if shared.args.loader == 'Transformers':
clear_torch_cache()
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 1d85a398..3b922fb4 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -54,7 +54,7 @@ def create_ui():
gr.HTML(value='☰', elem_id='gr-hover')
with gr.Column(scale=10, elem_id='chat-input-container'):
- shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar'])
+ shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf', 'image'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar'])
         shared.gradio['typing-dots'] = gr.HTML(value='<div class="typing"><span></span><span class="dot1"></span><span class="dot2"></span></div>', label='typing', elem_id='typing-container')
with gr.Column(scale=1, elem_id='generate-stop-container'):
From 6e9de75727ace45b3bf71ea3a98ef350b6d7414d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 8 Aug 2025 19:35:09 -0700
Subject: [PATCH 03/79] Support loading chat templates from chat_template.json
files
---
modules/models_settings.py | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 4e53dc81..729d5dd1 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -139,16 +139,26 @@ def get_model_metadata(model):
with open(jinja_path, 'r', encoding='utf-8') as f:
template = f.read()
+ # 2. If no .jinja file, try chat_template.json
+ if template is None:
+ json_template_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.json'
+ if json_template_path.exists():
+ with open(json_template_path, 'r', encoding='utf-8') as f:
+ json_data = json.load(f)
+ if 'chat_template' in json_data:
+ template = json_data['chat_template']
+
+ # 3. Fall back to tokenizer_config.json metadata
if path.exists():
metadata = json.loads(open(path, 'r', encoding='utf-8').read())
- # 2. Only read from metadata if we haven't already loaded from .jinja
+ # Only read from metadata if we haven't already loaded from .jinja or .json
if template is None and 'chat_template' in metadata:
template = metadata['chat_template']
if isinstance(template, list):
template = template[0]['template']
- # 3. If a template was found from either source, process it
+ # 4. If a template was found from any source, process it
if template:
for k in ['eos_token', 'bos_token']:
if k in metadata:
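Note: after this patch the chat template is resolved in a fixed order: standalone `.jinja` file, then `chat_template.json`, then the `chat_template` key in `tokenizer_config.json`. A condensed sketch of that precedence (file names and the helper are illustrative):

```python
import json
from pathlib import Path


def load_chat_template(model_dir: str):
    model_dir = Path(model_dir)
    template = None

    # 1. Standalone Jinja file
    jinja_path = model_dir / 'chat_template.jinja'
    if jinja_path.exists():
        template = jinja_path.read_text(encoding='utf-8')

    # 2. chat_template.json
    if template is None:
        json_path = model_dir / 'chat_template.json'
        if json_path.exists():
            template = json.loads(json_path.read_text(encoding='utf-8')).get('chat_template')

    # 3. tokenizer_config.json metadata
    if template is None:
        config_path = model_dir / 'tokenizer_config.json'
        if config_path.exists():
            template = json.loads(config_path.read_text(encoding='utf-8')).get('chat_template')

    if isinstance(template, list):  # some configs store a list of named templates
        template = template[0]['template']
    return template
```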
From 8fcadff8d3120d1f3e844cd030d59a8c2b0b2dfd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 8 Aug 2025 20:13:54 -0700
Subject: [PATCH 04/79] mtmd: Use the base64 attachment for the UI preview
instead of the file
---
modules/chat.py | 1 -
modules/html_generator.py | 5 ++---
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 354ae46b..98800239 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -640,7 +640,6 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
"type": "image",
"image_data": data_url,
"image_id": image_id,
- "file_path": str(path) # For UI preview
}
elif file_extension == '.pdf':
# Process PDF file
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 63a0cdd0..cb14a722 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -407,11 +407,10 @@ def format_message_attachments(history, role, index):
name = html.escape(attachment["name"])
if attachment.get("type") == "image":
- # Show image preview
- file_path = attachment.get("file_path", "")
+ image_data = attachment.get("image_data", "")
                 attachments_html += (
                     f'<div class="attachment-box image-attachment">'
-                    f'<img src="file/{file_path}" alt="{name}" class="image-preview">'
+                    f'<img src="{image_data}" alt="{name}" class="image-preview">'
                     f'<div class="attachment-name">{name}</div>'
                     f'</div>'
                 )
From 544c3a7c9f305b6a2141c3d02770250058d43322 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 8 Aug 2025 21:15:53 -0700
Subject: [PATCH 05/79] Polish the new exllamav3 loader
---
modules/exllamav3.py | 152 +++++++++++++++++++++++++++++--------------
modules/loaders.py | 21 +-----
2 files changed, 104 insertions(+), 69 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index c2532ec3..295c2737 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -2,8 +2,22 @@ import traceback
from pathlib import Path
from typing import Any, List, Tuple
+import torch
from exllamav3 import Cache, Config, Generator, Model, Tokenizer
from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
+from exllamav3.generator import Job
+# Import the base sampler components directly from exllamav3
+from exllamav3.generator.sampler import (
+ CustomSampler,
+ SS_Argmax,
+ SS_MinP,
+ SS_PresFreqP,
+ SS_RepP,
+ SS_Sample,
+ SS_Temperature,
+ SS_TopK,
+ SS_TopP
+)
from extensions.openai.image_utils import (
convert_image_attachments_to_pil,
@@ -11,6 +25,7 @@ from extensions.openai.image_utils import (
)
from modules import shared
from modules.logging_colors import logger
+from modules.text_generation import get_max_prompt_length
try:
import flash_attn
@@ -79,7 +94,6 @@ class Exllamav3Model:
load_params['use_per_device'] = split
model.load(**load_params)
-
tokenizer = Tokenizer.from_config(config)
# Load vision model component (ExLlamaV3 native)
@@ -127,11 +141,9 @@ class Exllamav3Model:
# From webui image_attachments (preferred format)
if 'image_attachments' in state and state['image_attachments']:
pil_images.extend(convert_image_attachments_to_pil(state['image_attachments']))
-
# From OpenAI API raw_images
elif 'raw_images' in state and state['raw_images']:
pil_images.extend(state['raw_images'])
-
# From OpenAI API messages format
elif 'messages' in state and state['messages']:
pil_images.extend(convert_openai_messages_to_images(state['messages']))
@@ -147,7 +159,6 @@ class Exllamav3Model:
image_embeddings = state['image_embeddings']
else:
# Do not reset the cache/allocator index; it causes token ID conflicts during generation.
-
logger.info(f"Processing {len(pil_images)} image(s) with ExLlamaV3 vision model")
image_embeddings = [
self.vision_model.get_image_embeddings(tokenizer=self.tokenizer, image=img)
@@ -178,46 +189,98 @@ class Exllamav3Model:
"""
Generate text with streaming using native ExLlamaV3 API
"""
- from exllamav3 import Job
- from exllamav3.generator.sampler.presets import ComboSampler
-
# Process images and modify prompt (ExLlamaV3-specific)
prompt, image_embeddings = self._process_images_for_generation(prompt, state)
- sampler = ComboSampler(
- rep_p=state.get('repetition_penalty', 1.0),
- freq_p=state.get('frequency_penalty', 0.0),
- pres_p=state.get('presence_penalty', 0.0),
- temperature=state.get('temperature', 0.7),
- min_p=state.get('min_p', 0.0),
- top_k=state.get('top_k', 0),
- top_p=state.get('top_p', 1.0),
- )
+ # -- Manually build and sort the sampler stack --
+ # Greedy decoding is a special case
+ if state['temperature'] == 0:
+ sampler = CustomSampler([SS_Argmax()])
+ else:
+ # 1. Create a list of all active, unordered samplers
+ unordered_samplers = []
+
+ # Penalties
+ penalty_range = state['repetition_penalty_range']
+ if penalty_range <= 0:
+ penalty_range = -1 # ExllamaV3 uses -1 for whole context
+ rep_decay = 0 # Not a configurable parameter
+
+ # Add penalty samplers if they are active
+ if state['repetition_penalty'] != 1.0:
+ unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay))
+ if state['presence_penalty'] != 0.0 or state['frequency_penalty'] != 0.0:
+ unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], state['frequency_penalty'], penalty_range, rep_decay))
+
+ # Standard samplers
+ if state['top_k'] > 0:
+ unordered_samplers.append(SS_TopK(state['top_k']))
+ if state['top_p'] < 1.0:
+ unordered_samplers.append(SS_TopP(state['top_p']))
+ if state['min_p'] > 0.0:
+ unordered_samplers.append(SS_MinP(state['min_p']))
+
+ # Temperature
+ unordered_samplers.append(SS_Temperature(state['temperature']))
+
+ # 2. Define the mapping from class names to the priority list keys
+ class_name_to_nickname = {
+ 'SS_RepP': 'repetition_penalty',
+ 'SS_PresFreqP': 'presence_frequency_penalty',
+ 'SS_TopK': 'top_k',
+ 'SS_TopP': 'top_p',
+ 'SS_MinP': 'min_p',
+ 'SS_Temperature': 'temperature',
+ }
+
+ # 3. Get the priority list and handle temperature_last
+ default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature']
+ sampler_priority = state.get('sampler_priority', default_priority)
+
+ if state['temperature_last'] and 'temperature' in sampler_priority:
+ sampler_priority.append(sampler_priority.pop(sampler_priority.index('temperature')))
+
+ # 4. Sort the unordered list based on the priority list
+ def custom_sort_key(sampler_obj):
+ class_name = sampler_obj.__class__.__name__
+ nickname = class_name_to_nickname.get(class_name)
+ if nickname in sampler_priority:
+ return sampler_priority.index(nickname)
+ return -1
+
+ ordered_samplers = sorted(unordered_samplers, key=custom_sort_key)
+
+ # 5. Add the final sampling stage and build the sampler
+ ordered_samplers.append(SS_Sample())
+ sampler = CustomSampler(ordered_samplers)
+ # -- End of sampler building --
# Encode prompt with embeddings (ExLlamaV3-specific)
- if image_embeddings:
- input_ids = self.tokenizer.encode(
- prompt,
- encode_special_tokens=True,
- embeddings=image_embeddings,
- )
+ input_ids = self.tokenizer.encode(
+ prompt,
+ add_bos=state['add_bos_token'],
+ encode_special_tokens=True,
+ embeddings=image_embeddings,
+ )
+
+ input_ids = input_ids[:, -get_max_prompt_length(state):]
+
+ # Determine max_new_tokens
+ if state['auto_max_new_tokens']:
+ max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
else:
- input_ids = self.tokenizer.encode(prompt, encode_special_tokens=True)
+ max_new_tokens = state['max_new_tokens']
- # Get stop conditions from state (webui format) - keep as strings like ExLlamaV3 examples
+ # Get stop conditions
stop_conditions = []
- if 'stopping_strings' in state and state['stopping_strings']:
- # Use strings directly (ExLlamaV3 handles the conversion internally)
- stop_conditions.extend(state['stopping_strings'])
-
- # Add EOS token ID as ExLlamaV3 examples do
- if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None:
- stop_conditions.append(self.tokenizer.eos_token_id)
+ if not state['ban_eos_token']:
+ if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None:
+ stop_conditions.append(self.tokenizer.eos_token_id)
job = Job(
input_ids=input_ids,
- max_new_tokens=state.get('max_new_tokens', 500),
- decode_special_tokens=True,
+ max_new_tokens=max_new_tokens,
+ decode_special_tokens=not state['skip_special_tokens'],
embeddings=image_embeddings if image_embeddings else None,
sampler=sampler,
stop_conditions=stop_conditions if stop_conditions else None,
@@ -244,25 +307,16 @@ class Exllamav3Model:
pass
def generate(self, prompt, state):
- """
- Generate text using native ExLlamaV3 API (non-streaming)
- """
- output = self.generator.generate(
- prompt=prompt,
- max_new_tokens=state.get('max_new_tokens', 500),
- temperature=state.get('temperature', 0.7),
- top_p=state.get('top_p', 1.0),
- top_k=state.get('top_k', 0),
- repetition_penalty=state.get('repetition_penalty', 1.0),
- frequency_penalty=state.get('frequency_penalty', 0.0),
- presence_penalty=state.get('presence_penalty', 0.0),
- min_p=state.get('min_p', 0.0),
- )
+ output = ""
+ for chunk in self.generate_with_streaming(prompt, state):
+ output = chunk
return output
def encode(self, string, **kwargs):
- return self.tokenizer.encode(string, **kwargs)
+ # Default add_bos to True for consistency with exllamav2 behavior
+ add_bos = kwargs.pop('add_bos', True)
+ return self.tokenizer.encode(string, add_bos=add_bos, **kwargs)
def decode(self, ids, **kwargs):
return self.tokenizer.decode(ids, **kwargs)
@@ -301,8 +355,6 @@ class Exllamav3Model:
# Force GPU memory cleanup
import gc
-
- import torch
gc.collect()
if torch.cuda.is_available():
diff --git a/modules/loaders.py b/modules/loaders.py
index e9437c16..151de990 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -255,38 +255,21 @@ loaders_samplers = {
},
'ExLlamav3': {
'temperature',
- 'dynatemp_low',
- 'dynatemp_high',
- 'dynatemp_exponent',
- 'smoothing_factor',
'min_p',
'top_p',
'top_k',
- 'typical_p',
- 'xtc_threshold',
- 'xtc_probability',
- 'tfs',
- 'top_a',
- 'dry_multiplier',
- 'dry_allowed_length',
- 'dry_base',
'repetition_penalty',
'frequency_penalty',
'presence_penalty',
'repetition_penalty_range',
- 'mirostat_mode',
- 'mirostat_tau',
- 'mirostat_eta',
- 'dynamic_temperature',
'temperature_last',
+ 'sampler_priority',
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
- 'skip_special_tokens',
'seed',
- 'custom_token_bans',
- 'dry_sequence_breakers',
+ 'skip_special_tokens',
},
'ExLlamav2': {
'temperature',
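Note: the reworked sampler setup builds a list of active `SS_*` stages and orders them by the UI's `sampler_priority` list, with `temperature_last` moving temperature to the end. The ordering logic in isolation, using stand-in classes instead of the real exllamav3 samplers:

```python
# Stand-ins for the exllamav3 SS_* sampler stages; only the ordering logic is shown.
class SS_RepP: ...
class SS_TopK: ...
class SS_TopP: ...
class SS_Temperature: ...


CLASS_TO_NICKNAME = {
    'SS_RepP': 'repetition_penalty',
    'SS_TopK': 'top_k',
    'SS_TopP': 'top_p',
    'SS_Temperature': 'temperature',
}


def order_samplers(samplers, priority, temperature_last=False):
    priority = list(priority)
    if temperature_last and 'temperature' in priority:
        priority.append(priority.pop(priority.index('temperature')))

    def key(sampler_obj):
        nickname = CLASS_TO_NICKNAME.get(sampler_obj.__class__.__name__)
        return priority.index(nickname) if nickname in priority else -1

    return sorted(samplers, key=key)


stack = order_samplers(
    [SS_Temperature(), SS_TopK(), SS_RepP(), SS_TopP()],
    ['repetition_penalty', 'top_k', 'top_p', 'min_p', 'temperature'],
    temperature_last=True,
)
print([s.__class__.__name__ for s in stack])
# ['SS_RepP', 'SS_TopK', 'SS_TopP', 'SS_Temperature']
```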
From 9e260332cc9da24a407bb59aadf0cf6a9cf0d88c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 8 Aug 2025 21:22:47 -0700
Subject: [PATCH 06/79] Remove some unnecessary code
---
modules/exllamav3.py | 13 ++-----------
1 file changed, 2 insertions(+), 11 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 295c2737..d616d2f5 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -26,6 +26,7 @@ from extensions.openai.image_utils import (
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
+from modules.torch_utils import clear_torch_cache
try:
import flash_attn
@@ -342,6 +343,7 @@ class Exllamav3Model:
del self.model
except Exception as e:
logger.warning(f"Error unloading main model: {e}")
+
self.model = None
if hasattr(self, 'cache') and self.cache is not None:
@@ -352,14 +354,3 @@ class Exllamav3Model:
if hasattr(self, 'tokenizer') and self.tokenizer is not None:
self.tokenizer = None
-
- # Force GPU memory cleanup
- import gc
- gc.collect()
-
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- torch.cuda.synchronize()
- torch.cuda.empty_cache()
-
- logger.info("ExLlamaV3 model fully unloaded")
From 1168004067dbe37791e6911fc3ed3386d8131ce3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 07:01:55 -0700
Subject: [PATCH 07/79] Minor change
---
modules/exllamav3.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index d616d2f5..f6c56cb0 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -6,7 +6,6 @@ import torch
from exllamav3 import Cache, Config, Generator, Model, Tokenizer
from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
from exllamav3.generator import Job
-# Import the base sampler components directly from exllamav3
from exllamav3.generator.sampler import (
CustomSampler,
SS_Argmax,
From 3f5ec9644f5aec2045126cdc5d962ee6f0b44c14 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 07:06:07 -0700
Subject: [PATCH 08/79] mtmd: Place the image <__media__> at the top of the
prompt
---
modules/chat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/chat.py b/modules/chat.py
index 98800239..0a03a084 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -289,7 +289,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
if image_refs or attachments_text:
enhanced_user_msg = user_msg
if image_refs:
- enhanced_user_msg += f" {image_refs}"
+ enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}"
if attachments_text:
enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
From d9db8f63a719f799bac8f05ed567a1ba38041a72 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 07:25:42 -0700
Subject: [PATCH 09/79] mtmd: Simplifications
---
extensions/openai/completions.py | 33 +++++++-------------------------
1 file changed, 7 insertions(+), 26 deletions(-)
diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 3d389f0b..ff64527a 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -407,6 +407,10 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
# Add messages to generate_params if present for multimodal processing
if 'messages' in body:
generate_params['messages'] = body['messages']
+ raw_images = convert_openai_messages_to_images(generate_params['messages'])
+ if raw_images:
+ logger.info(f"Found {len(raw_images)} image(s) in request.")
+ generate_params['raw_images'] = raw_images
if not stream:
prompt_arg = body[prompt_str]
@@ -423,7 +427,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
total_prompt_token_count = 0
for idx, prompt in enumerate(prompt_arg, start=0):
- if isinstance(prompt[0], int):
+ if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], int):
# token lists
if requested_model == shared.model_name:
prompt = decode(prompt)[0]
@@ -438,19 +442,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
-
- # Use multimodal generation if images are present
- if 'messages' in generate_params:
- raw_images = convert_openai_messages_to_images(generate_params['messages'])
- if raw_images:
- logger.info(f"Using multimodal generation for {len(raw_images)} images")
- generate_params['raw_images'] = raw_images
- generator = shared.model.generate_with_streaming(prompt, generate_params)
- else:
- generator = generate_reply(prompt, generate_params, is_chat=False)
- else:
- generator = generate_reply(prompt, generate_params, is_chat=False)
-
+ generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
for a in generator:
@@ -523,18 +515,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
- # Use multimodal generation if images are present
- if 'messages' in generate_params:
- raw_images = convert_openai_messages_to_images(generate_params['messages'])
- if raw_images:
- logger.info(f"Using multimodal generation for {len(raw_images)} images")
- generate_params['raw_images'] = raw_images
- generator = shared.model.generate_with_streaming(prompt, generate_params)
- else:
- generator = generate_reply(prompt, generate_params, is_chat=False)
- else:
- generator = generate_reply(prompt, generate_params, is_chat=False)
-
+ generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
seen_content = ''
completion_token_count = 0
From fa9be444fa0b3e18763b1cd3d0dd07ad565ac1bb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 07:26:59 -0700
Subject: [PATCH 10/79] Use ExLlamav3 instead of ExLlamav3_HF by default for
EXL3 models
---
modules/models_settings.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 729d5dd1..d3bf4a36 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -211,11 +211,11 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
elif re.match(r'.*\.gguf', model_name.lower()):
loader = 'llama.cpp'
elif hf_quant_method == 'exl3':
- loader = 'ExLlamav3_HF'
+ loader = 'ExLlamav3'
elif hf_quant_method in ['exl2', 'gptq']:
loader = 'ExLlamav2_HF'
elif re.match(r'.*exl3', model_name.lower()):
- loader = 'ExLlamav3_HF'
+ loader = 'ExLlamav3'
elif re.match(r'.*exl2', model_name.lower()):
loader = 'ExLlamav2_HF'
else:
From f396b82a4f92f5823ed2a9bd1ff32d915da4cf9a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 07:31:36 -0700
Subject: [PATCH 11/79] mtmd: Better way to detect if an EXL3 model is
multimodal
---
modules/exllamav3.py | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index f6c56cb0..70f6c2f1 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -98,13 +98,16 @@ class Exllamav3Model:
# Load vision model component (ExLlamaV3 native)
vision_model = None
- try:
- logger.info("Loading vision model component...")
- vision_model = Model.from_config(config, component="vision")
- vision_model.load(progressbar=True)
- logger.info("Vision model loaded successfully")
- except Exception as e:
- logger.warning(f"Vision model loading failed (multimodal disabled): {e}")
+ if "vision_config" in config.config_dict:
+ logger.info("Vision component detected in model config. Attempting to load...")
+ try:
+ vision_model = Model.from_config(config, component="vision")
+ vision_model.load(progressbar=True)
+ logger.info("Vision model loaded successfully.")
+ except Exception as e:
+ logger.warning(f"Vision model loading failed (multimodal disabled): {e}")
+ else:
+ logger.info("No vision component in model config. Skipping multimodal setup.")
generator = Generator(
model=model,
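
The detection rule above keys off a `vision_config` entry in the model configuration instead of attempting the vision load and catching the failure. A minimal standalone sketch of the same check against an HF-style `config.json` (the on-disk layout here is an assumption; the real loader reads `config.config_dict` from ExLlamaV3):

```python
import json
from pathlib import Path


def has_vision_component(model_dir: str) -> bool:
    """Return True if the model's config.json declares a vision tower."""
    config_path = Path(model_dir) / 'config.json'
    if not config_path.exists():
        return False
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    return 'vision_config' in config
```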
From 59c6138e989861020816041821369fd9cd6b0ffa Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 07:32:15 -0700
Subject: [PATCH 12/79] Remove a log message
---
modules/exllamav3.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 70f6c2f1..bdb0c760 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -48,7 +48,6 @@ class Exllamav3Model:
global_allocator
)
global_allocator.next_token_index = FIRST_MM_EMBEDDING_INDEX
- logger.info("Reset MMTokenAllocator for clean multimodal token allocation")
config = Config.from_directory(str(path_to_model))
model = Model.from_config(config)
From 2fe79a93ccf21121db3dd076050df93f08c5bdb9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 07:50:24 -0700
Subject: [PATCH 13/79] mtmd: Handle another case after
3f5ec9644f5aec2045126cdc5d962ee6f0b44c14
---
modules/chat.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/modules/chat.py b/modules/chat.py
index 0a03a084..b127b489 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -326,7 +326,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
if image_refs or attachments_text:
- user_input = f"{user_input} {image_refs}"
+ user_input = user_input
+ if image_refs:
+ user_input = f"{image_refs}\n\n{user_input}"
if attachments_text:
user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
From a6d6bee88cf52dc4bbfaeac91df5a951810ab0dd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 07:51:03 -0700
Subject: [PATCH 14/79] Change a comment
---
modules/chat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/chat.py b/modules/chat.py
index b127b489..639feebf 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -813,7 +813,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
for file_path in files:
add_message_attachment(output, row_idx, file_path, is_user=True)
- # Collect image attachments for ExLlamaV3
+ # Collect image attachments for multimodal generation
image_attachments = []
if 'metadata' in output:
user_key = f"user_{row_idx}"
From d489eb589a698e37540861fe2951ef66efdb772d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 14:10:41 -0700
Subject: [PATCH 15/79] Attempt to fix undefined behavior in the new exllamav3
 loader when switching conversations

---
modules/exllamav3.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index bdb0c760..e3a2d95a 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -304,9 +304,7 @@ class Exllamav3Model:
response_text += chunk
yield response_text
finally:
- # No cleanup needed. MMEmbedding lifetime is managed by Python.
- # Cache and page table resets are unnecessary and can cause token ID conflicts.
- pass
+ self.generator.clear_queue()
def generate(self, prompt, state):
output = ""
From a289a92b9408e2542632ffa600ef57c373200aec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 17:10:58 -0700
Subject: [PATCH 16/79] Fix exllamav3 token count
---
modules/exllamav3.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index e3a2d95a..268a64ec 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -267,9 +267,11 @@ class Exllamav3Model:
input_ids = input_ids[:, -get_max_prompt_length(state):]
+ self._last_prompt_token_count = input_ids.shape[-1]
+
# Determine max_new_tokens
if state['auto_max_new_tokens']:
- max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
+ max_new_tokens = state['truncation_length'] - self._last_prompt_token_count
else:
max_new_tokens = state['max_new_tokens']
@@ -323,8 +325,7 @@ class Exllamav3Model:
@property
def last_prompt_token_count(self):
- # This would need to be tracked during generation
- return 0
+ return getattr(self, '_last_prompt_token_count', 0)
def unload(self):
logger.info("Unloading ExLlamaV3 model components...")
From eb16f6401794340c751e8105bc3837967b9e054e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 17:12:16 -0700
Subject: [PATCH 17/79] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_cuda128.txt | 4 ++--
requirements/full/requirements_cuda128_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
18 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index f17cae8a..323ef0f9 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 51f4571f..2a7c9361 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 37021c77..0106fbea 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index f54ae191..d5db4a1c 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index e495455b..694f1ff8 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 72847534..392637e2 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index ed641a24..88eaa930 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
index d7fe735b..6accc2f0 100644
--- a/requirements/full/requirements_cuda128.txt
+++ b/requirements/full/requirements_cuda128.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
index cb71f74b..3025f092 100644
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ b/requirements/full/requirements_cuda128_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index d6bed576..7394bdcf 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 1f17dc50..a095a4c7 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 82254842..ea43e56e 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 986a3d49..79737728 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 833e923b..d39786bd 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 6a894d49..0b373fa9 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 0afb19c2..fe9dccac 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index a404f50c..b3cfd525 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 75176656..02aa03e3 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From d86b0ec0103f24f27329a600d1cbaf3a5ea4c517 Mon Sep 17 00:00:00 2001
From: oobabooga
Date: Sun, 10 Aug 2025 01:27:25 -0300
Subject: [PATCH 18/79] Add multimodal support (llama.cpp) (#7027)
---
extensions/openai/image_utils.py | 9 ++++
modules/chat.py | 26 ++++++------
modules/llama_cpp_server.py | 46 ++++++++++++++++++---
modules/loaders.py | 2 +
modules/shared.py | 1 +
modules/ui.py | 1 +
modules/ui_model_menu.py | 6 +++
modules/utils.py | 13 ++++++
user_data/mmproj/place-your-mmproj-here.txt | 0
9 files changed, 86 insertions(+), 18 deletions(-)
create mode 100644 user_data/mmproj/place-your-mmproj-here.txt
diff --git a/extensions/openai/image_utils.py b/extensions/openai/image_utils.py
index c54f0532..658f00d7 100644
--- a/extensions/openai/image_utils.py
+++ b/extensions/openai/image_utils.py
@@ -11,6 +11,15 @@ from PIL import Image
from modules.logging_colors import logger
+def convert_pil_to_base64(image: Image.Image) -> str:
+ """Converts a PIL Image to a base64 encoded string."""
+ buffered = io.BytesIO()
+ # Save image to an in-memory bytes buffer in PNG format
+ image.save(buffered, format="PNG")
+ # Encode the bytes to a base64 string
+ return base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+
def decode_base64_image(base64_string: str) -> Image.Image:
"""Decodes a base64 string to a PIL Image."""
try:
diff --git a/modules/chat.py b/modules/chat.py
index 639feebf..696fa350 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -813,19 +813,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
for file_path in files:
add_message_attachment(output, row_idx, file_path, is_user=True)
- # Collect image attachments for multimodal generation
- image_attachments = []
- if 'metadata' in output:
- user_key = f"user_{row_idx}"
- if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
- for attachment in output['metadata'][user_key]["attachments"]:
- if attachment.get("type") == "image":
- image_attachments.append(attachment)
-
- # Add image attachments to state for the generation
- if image_attachments:
- state['image_attachments'] = image_attachments
-
# Add web search results as attachments if enabled
if state.get('enable_web_search', False):
search_query = generate_search_query(text, state)
@@ -881,6 +868,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
'metadata': output['metadata']
}
+ # Collect image attachments for multimodal generation
+ image_attachments = []
+ if 'metadata' in output:
+ user_key = f"user_{row_idx}"
+ if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
+ for attachment in output['metadata'][user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ image_attachments.append(attachment)
+
+ # Add image attachments to state for the generation
+ if image_attachments:
+ state['image_attachments'] = image_attachments
+
# Generate the prompt
kwargs = {
'_continue': _continue,
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index e64f1694..072ff83b 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -12,6 +12,10 @@ from pathlib import Path
import llama_cpp_binaries
import requests
+from extensions.openai.image_utils import (
+ convert_image_attachments_to_pil,
+ convert_pil_to_base64
+)
from modules import shared
from modules.logging_colors import logger
@@ -128,15 +132,40 @@ class LlamaServer:
url = f"http://127.0.0.1:{self.port}/completion"
payload = self.prepare_payload(state)
- token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"])
- self.last_prompt_token_count = len(token_ids)
+ pil_images = []
+ # Check for images from the Web UI (image_attachments)
+ if 'image_attachments' in state and state['image_attachments']:
+ pil_images.extend(convert_image_attachments_to_pil(state['image_attachments']))
+ # Else, check for images from the API (raw_images)
+ elif 'raw_images' in state and state['raw_images']:
+ pil_images.extend(state.get('raw_images', []))
+
+ if pil_images:
+ # Multimodal case
+ IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image
+
+ base64_images = [convert_pil_to_base64(img) for img in pil_images]
+ multimodal_prompt_object = {
+ "prompt": prompt,
+ "multimodal_data": base64_images
+ }
+ payload["prompt"] = multimodal_prompt_object
+
+ # Calculate an estimated token count
+ text_tokens = self.encode(prompt, add_bos_token=state["add_bos_token"])
+ self.last_prompt_token_count = len(text_tokens) + (len(pil_images) * IMAGE_TOKEN_COST_ESTIMATE)
+ else:
+ # Text only case
+ token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"])
+ self.last_prompt_token_count = len(token_ids)
+ payload["prompt"] = token_ids
+
if state['auto_max_new_tokens']:
- max_new_tokens = state['truncation_length'] - len(token_ids)
+ max_new_tokens = state['truncation_length'] - self.last_prompt_token_count
else:
max_new_tokens = state['max_new_tokens']
payload.update({
- "prompt": token_ids,
"n_predict": max_new_tokens,
"stream": True,
"cache_prompt": True
@@ -144,7 +173,7 @@ class LlamaServer:
if shared.args.verbose:
logger.info("GENERATE_PARAMS=")
- printable_payload = {k: v for k, v in payload.items() if k != "prompt"}
+ printable_payload = {k: (v if k != "prompt" else "[multimodal object]" if pil_images else v) for k, v in payload.items()}
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
@@ -295,6 +324,13 @@ class LlamaServer:
cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
if shared.args.rope_freq_base > 0:
cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
+ if shared.args.mmproj not in [None, 'None']:
+ path = Path(shared.args.mmproj)
+ if not path.exists():
+ path = Path('user_data/mmproj') / shared.args.mmproj
+
+ if path.exists():
+ cmd += ["--mmproj", str(path)]
if shared.args.model_draft not in [None, 'None']:
path = Path(shared.args.model_draft)
if not path.exists():
diff --git a/modules/loaders.py b/modules/loaders.py
index 151de990..feca9985 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -28,6 +28,8 @@ loaders_and_params = OrderedDict({
'device_draft',
'ctx_size_draft',
'speculative_decoding_accordion',
+ 'mmproj',
+ 'mmproj_accordion',
'vram_info',
],
'Transformers': [
diff --git a/modules/shared.py b/modules/shared.py
index 1de4306b..e9d8a62f 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -85,6 +85,7 @@ group.add_argument('--no-kv-offload', action='store_true', help='Do not offload
group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
+group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
# Cache
group = parser.add_argument_group('Context and cache')
diff --git a/modules/ui.py b/modules/ui.py
index e7805046..1171cd48 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -167,6 +167,7 @@ def list_model_elements():
'gpu_layers_draft',
'device_draft',
'ctx_size_draft',
+ 'mmproj',
]
return elements
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 0ab67e7a..9fa8a4f4 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -59,6 +59,12 @@ def create_ui():
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
+ # Multimodal
+ with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']:
+ with gr.Row():
+ shared.gradio['mmproj'] = gr.Dropdown(label="mmproj file", choices=utils.get_available_mmproj(), value=lambda: shared.args.mmproj or 'None', elem_classes='slim-dropdown', info='Select a file that matches your model. Must be placed in user_data/mmproj/', interactive=not mu)
+ ui.create_refresh_button(shared.gradio['mmproj'], lambda: None, lambda: {'choices': utils.get_available_mmproj()}, 'refresh-button', interactive=not mu)
+
# Speculative decoding
with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
with gr.Row():
diff --git a/modules/utils.py b/modules/utils.py
index 117ad590..4927ef04 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -154,6 +154,19 @@ def get_available_ggufs():
return sorted(model_list, key=natural_keys)
+def get_available_mmproj():
+ mmproj_dir = Path('user_data/mmproj')
+ if not mmproj_dir.exists():
+ return ['None']
+
+ mmproj_files = []
+ for item in mmproj_dir.iterdir():
+ if item.is_file() and item.suffix.lower() in ('.gguf', '.bin'):
+ mmproj_files.append(item.name)
+
+ return ['None'] + sorted(mmproj_files, key=natural_keys)
+
+
def get_available_presets():
return sorted(set((k.stem for k in Path('user_data/presets').glob('*.yaml'))), key=natural_keys)
diff --git a/user_data/mmproj/place-your-mmproj-here.txt b/user_data/mmproj/place-your-mmproj-here.txt
new file mode 100644
index 00000000..e69de29b
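
For reference, in the llama.cpp path added above the prompt becomes an object carrying the text plus base64-encoded PNGs under `multimodal_data`, and this only works when llama-server was started with `--mmproj`, which the same patch wires up. A rough usage sketch; the field names are taken from the diff, while the port, image path, and non-streaming request are assumptions made to keep the example blocking and self-contained:

```python
import base64
import io

import requests
from PIL import Image


def pil_to_base64_png(image: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG string, as convert_pil_to_base64 does."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


image = Image.open("cat.png")  # assumed local test image
payload = {
    "prompt": {
        "prompt": "<__media__>\n\nDescribe this image.",
        "multimodal_data": [pil_to_base64_png(image)],
    },
    "n_predict": 512,
    "stream": False,  # the webui streams; disabled here for a simple blocking call
    "cache_prompt": True,
}
response = requests.post("http://127.0.0.1:8080/completion", json=payload, timeout=120)
print(response.json().get("content", ""))
```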
From c6b4d1e87f67dd990c66eaecb35c9cc70d0ae4e3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 21:33:12 -0700
Subject: [PATCH 19/79] Fix the exllamav2 loader ignoring add_bos
---
modules/exllamav2.py | 3 ++-
modules/exllamav3.py | 1 -
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 6bb422ea..5d5c5b56 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -135,7 +135,8 @@ class Exllamav2Model:
return result, result
def encode(self, string, **kwargs):
- return self.tokenizer.encode(string, add_bos=True, encode_special_tokens=True)
+ add_bos = kwargs.pop('add_bos', True)
+ return self.tokenizer.encode(string, add_bos=add_bos, encode_special_tokens=True, **kwargs)
def decode(self, ids, **kwargs):
if isinstance(ids, list):
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 268a64ec..9201801c 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -316,7 +316,6 @@ class Exllamav3Model:
return output
def encode(self, string, **kwargs):
- # Default add_bos to True for consistency with exllamav2 behavior
add_bos = kwargs.pop('add_bos', True)
return self.tokenizer.encode(string, add_bos=add_bos, **kwargs)
From 2f90ac98807a4ffa6a761bbcef5cf81a9de568b8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 21:41:38 -0700
Subject: [PATCH 20/79] Move the new image_utils.py file to modules/
---
extensions/openai/completions.py | 2 +-
modules/exllamav3.py | 4 ++--
{extensions/openai => modules}/image_utils.py | 0
modules/llama_cpp_server.py | 4 ++--
4 files changed, 5 insertions(+), 5 deletions(-)
rename {extensions/openai => modules}/image_utils.py (100%)
diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index ff64527a..f4944060 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -7,7 +7,6 @@ import tiktoken
from pydantic import ValidationError
from extensions.openai.errors import InvalidRequestError
-from extensions.openai.image_utils import convert_openai_messages_to_images
from extensions.openai.typing import ToolDefinition
from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
from modules import shared
@@ -17,6 +16,7 @@ from modules.chat import (
load_character_memoized,
load_instruction_template_memoized
)
+from modules.image_utils import convert_openai_messages_to_images
from modules.logging_colors import logger
from modules.presets import load_preset_memoized
from modules.text_generation import decode, encode, generate_reply
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 9201801c..9d597ce7 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -18,11 +18,11 @@ from exllamav3.generator.sampler import (
SS_TopP
)
-from extensions.openai.image_utils import (
+from modules import shared
+from modules.image_utils import (
convert_image_attachments_to_pil,
convert_openai_messages_to_images
)
-from modules import shared
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
from modules.torch_utils import clear_torch_cache
diff --git a/extensions/openai/image_utils.py b/modules/image_utils.py
similarity index 100%
rename from extensions/openai/image_utils.py
rename to modules/image_utils.py
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 072ff83b..3e8127ab 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -12,11 +12,11 @@ from pathlib import Path
import llama_cpp_binaries
import requests
-from extensions.openai.image_utils import (
+from modules import shared
+from modules.image_utils import (
convert_image_attachments_to_pil,
convert_pil_to_base64
)
-from modules import shared
from modules.logging_colors import logger
llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"}
From 4663b1a56e99b8d637f9ac67c8b8d0e09d496ec7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 21:45:50 -0700
Subject: [PATCH 21/79] Update docs
---
docs/12 - OpenAI API.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index b7b5fbc1..fc76cd8b 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -77,7 +77,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \
}'
```
-#### Multimodal support (ExLlamaV3)
+#### Multimodal support (llama.cpp and ExLlamaV3)
```shell
curl http://127.0.0.1:5000/v1/chat/completions \
From 0ea62d88f60689b44dd4ee42ae9ba0ff871a29c2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 21:47:02 -0700
Subject: [PATCH 22/79] mtmd: Fix "continue" when an image is present
---
modules/chat.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/modules/chat.py b/modules/chat.py
index 696fa350..42bb58a5 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -868,6 +868,8 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
'metadata': output['metadata']
}
+ row_idx = len(output['internal']) - 1
+
# Collect image attachments for multimodal generation
image_attachments = []
if 'metadata' in output:
@@ -895,7 +897,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
prompt = generate_chat_prompt(text, state, **kwargs)
# Add timestamp for assistant's response at the start of generation
- row_idx = len(output['internal']) - 1
update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp(), model_name=shared.model_name)
# Generate
From 1fb580785937ad42e8657a5fd894dfcd5a1fdeb3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 10 Aug 2025 06:54:44 -0700
Subject: [PATCH 23/79] mtmd: Fix API text completion when no images are sent
---
extensions/openai/completions.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index f4944060..6f4dfc29 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -405,7 +405,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
echo = body['echo']
# Add messages to generate_params if present for multimodal processing
- if 'messages' in body:
+ if body.get('messages'):
generate_params['messages'] = body['messages']
raw_images = convert_openai_messages_to_images(generate_params['messages'])
if raw_images:
From 6fbf162d712cef876b128651fdebeb08a4f32538 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 10 Aug 2025 07:21:55 -0700
Subject: [PATCH 24/79] Default max_tokens to 512 in the API instead of 16
---
extensions/openai/typing.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index e9f92da5..90366270 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -106,7 +106,7 @@ class CompletionRequestParams(BaseModel):
frequency_penalty: float | None = 0
logit_bias: dict | None = None
logprobs: int | None = None
- max_tokens: int | None = 16
+ max_tokens: int | None = 512
n: int | None = Field(default=1, description="Unused parameter.")
presence_penalty: float | None = 0
stop: str | List[str] | None = None
@@ -232,7 +232,7 @@ class LogitsRequestParams(BaseModel):
use_samplers: bool = False
top_logits: int | None = 50
frequency_penalty: float | None = 0
- max_tokens: int | None = 16
+ max_tokens: int | None = 512
presence_penalty: float | None = 0
temperature: float | None = 1
top_p: float | None = 1
From cc964ee579463d4c3acb35c188ee8eb38e23ce1a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 10 Aug 2025 07:44:38 -0700
Subject: [PATCH 25/79] mtmd: Increase the size of the UI image preview
---
css/main.css | 1 +
1 file changed, 1 insertion(+)
diff --git a/css/main.css b/css/main.css
index de16d81d..062d3eb2 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1579,6 +1579,7 @@ strong {
.image-attachment {
flex-direction: column;
+ max-width: 314px;
}
.image-preview {
From 9ec310d858824d5f9186bca277a3ab77ac556b75 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 10 Aug 2025 07:54:21 -0700
Subject: [PATCH 26/79] UI: Fix the color of italic text
---
css/html_instruct_style.css | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 9831ee8f..3e5ebe67 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -13,7 +13,7 @@
line-height: 28px !important;
}
-.dark .chat .message-body :is(p, li, q, h1, h2, h3, h4, h5, h6) {
+.dark .chat .message-body :is(p, li, q, em, h1, h2, h3, h4, h5, h6) {
color: #d1d5db !important;
}
From c5340533c0b3a9edaea6c253f99250f09f2c26a5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 10 Aug 2025 20:39:04 -0700
Subject: [PATCH 27/79] mtmd: Add another API example
---
docs/12 - OpenAI API.md | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index fc76cd8b..5dc98a51 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -77,7 +77,9 @@ curl http://127.0.0.1:5000/v1/chat/completions \
}'
```
-#### Multimodal support (llama.cpp and ExLlamaV3)
+#### Multimodal/vision (llama.cpp and ExLlamaV3)
+
+##### /v1/chat/completions (recommended!)
```shell
curl http://127.0.0.1:5000/v1/chat/completions \
@@ -87,7 +89,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \
{
"role": "user",
"content": [
- {"type": "text", "text": "What color is this image?"},
+ {"type": "text", "text": "Please describe what you see in this image."},
{"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}}
]
}
@@ -95,6 +97,38 @@ curl http://127.0.0.1:5000/v1/chat/completions \
}'
```
+##### /v1/completions
+
+```shell
+curl http://127.0.0.1:5000/v1/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": "About image <__media__> and image <__media__>, what I can say is that the first one"
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"
+ }
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/strawberry.png?raw=true"
+ }
+ }
+ ]
+ }
+ ]
+ }'
+```
+
#### SSE streaming
```shell
From 4d8dbbab648d14680741324d187bee23a8bea486 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 07:26:11 -0700
Subject: [PATCH 28/79] API: Fix sampler_priority usage for ExLlamaV3
---
modules/exllamav3.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 9d597ce7..8f686669 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -237,7 +237,7 @@ class Exllamav3Model:
# 3. Get the priority list and handle temperature_last
default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature']
- sampler_priority = state.get('sampler_priority', default_priority)
+ sampler_priority = state.get('sampler_priority') or default_priority
if state['temperature_last'] and 'temperature' in sampler_priority:
sampler_priority.append(sampler_priority.pop(sampler_priority.index('temperature')))
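
The small change above matters because `dict.get(key, default)` only falls back when the key is missing, not when the API sent an empty value. A standalone illustration:

```python
default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature']

state = {'sampler_priority': []}  # key present but empty, e.g. omitted by an API client
print(state.get('sampler_priority', default_priority))    # [] -> every sampler would be skipped
print(state.get('sampler_priority') or default_priority)  # falls back to the full default list
```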
From 4809ddfeb85e8b8d28bb617366c86fd8037815ee Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 07:35:22 -0700
Subject: [PATCH 29/79] Exllamav3: small sampler fixes
---
modules/exllamav3.py | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 8f686669..5c142ec2 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -6,7 +6,9 @@ import torch
from exllamav3 import Cache, Config, Generator, Model, Tokenizer
from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
from exllamav3.generator import Job
-from exllamav3.generator.sampler import (
+
+from modules import shared
+from modules.exllamav3_custom_sampler import (
CustomSampler,
SS_Argmax,
SS_MinP,
@@ -17,8 +19,6 @@ from exllamav3.generator.sampler import (
SS_TopK,
SS_TopP
)
-
-from modules import shared
from modules.image_utils import (
convert_image_attachments_to_pil,
convert_openai_messages_to_images
@@ -194,7 +194,6 @@ class Exllamav3Model:
# Process images and modify prompt (ExLlamaV3-specific)
prompt, image_embeddings = self._process_images_for_generation(prompt, state)
- # -- Manually build and sort the sampler stack --
# Greedy decoding is a special case
if state['temperature'] == 0:
sampler = CustomSampler([SS_Argmax()])
@@ -205,7 +204,7 @@ class Exllamav3Model:
# Penalties
penalty_range = state['repetition_penalty_range']
if penalty_range <= 0:
- penalty_range = -1 # ExllamaV3 uses -1 for whole context
+ penalty_range = int(10e7) # Use large number for "full context"
rep_decay = 0 # Not a configurable parameter
# Add penalty samplers if they are active
@@ -222,7 +221,7 @@ class Exllamav3Model:
if state['min_p'] > 0.0:
unordered_samplers.append(SS_MinP(state['min_p']))
- # Temperature
+ # Temperature (SS_NoOp is returned if temp is 1.0)
unordered_samplers.append(SS_Temperature(state['temperature']))
# 2. Define the mapping from class names to the priority list keys
@@ -246,7 +245,7 @@ class Exllamav3Model:
def custom_sort_key(sampler_obj):
class_name = sampler_obj.__class__.__name__
nickname = class_name_to_nickname.get(class_name)
- if nickname in sampler_priority:
+ if nickname and nickname in sampler_priority:
return sampler_priority.index(nickname)
return -1
@@ -255,7 +254,6 @@ class Exllamav3Model:
# 5. Add the final sampling stage and build the sampler
ordered_samplers.append(SS_Sample())
sampler = CustomSampler(ordered_samplers)
- # -- End of sampler building --
# Encode prompt with embeddings (ExLlamaV3-specific)
input_ids = self.tokenizer.encode(
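
The sort-key guard above (`if nickname and nickname in sampler_priority`) keeps unmapped stages from breaking the sort and makes them run before everything in the priority list. A self-contained sketch with dummy stage classes (not the real exllamav3 samplers):

```python
class SS_TopK: ...
class SS_TopP: ...
class SS_Temperature: ...


CLASS_NAME_TO_NICKNAME = {
    'SS_TopK': 'top_k',
    'SS_TopP': 'top_p',
    'SS_Temperature': 'temperature',
}


def order_samplers(samplers, sampler_priority):
    def sort_key(sampler_obj):
        nickname = CLASS_NAME_TO_NICKNAME.get(type(sampler_obj).__name__)
        if nickname and nickname in sampler_priority:
            return sampler_priority.index(nickname)
        return -1  # unmapped stages sort before everything in the priority list

    return sorted(samplers, key=sort_key)


priority = ['top_k', 'top_p', 'temperature']  # temperature_last would move it to the end
stages = [SS_Temperature(), SS_TopK(), SS_TopP()]
print([type(s).__name__ for s in order_samplers(stages, priority)])
# ['SS_TopK', 'SS_TopP', 'SS_Temperature']
```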
From 1cb800d3923093e102470c6dde4e4a8b451e0a33 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 07:37:10 -0700
Subject: [PATCH 30/79] Docs: reword the multimodal example headings
---
docs/12 - OpenAI API.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 5dc98a51..fd3309c7 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -79,7 +79,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \
#### Multimodal/vision (llama.cpp and ExLlamaV3)
-##### /v1/chat/completions (recommended!)
+##### With /v1/chat/completions (recommended!)
```shell
curl http://127.0.0.1:5000/v1/chat/completions \
@@ -97,7 +97,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \
}'
```
-##### /v1/completions
+##### With /v1/completions
```shell
curl http://127.0.0.1:5000/v1/completions \
From 52d1cbbbe95bed853241ab422b0558ab029d7d08 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 07:38:39 -0700
Subject: [PATCH 31/79] Fix an import
---
modules/exllamav3.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 5c142ec2..3fabdb6b 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -8,7 +8,7 @@ from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
from exllamav3.generator import Job
from modules import shared
-from modules.exllamav3_custom_sampler import (
+from exllamav3.generator.sampler import (
CustomSampler,
SS_Argmax,
SS_MinP,
From 38c0b4a1adc613e9e3a237835faa1d88632733ef Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 07:39:53 -0700
Subject: [PATCH 32/79] Default ctx-size to 8192 when not found in the metadata
---
modules/models_settings.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models_settings.py b/modules/models_settings.py
index d3bf4a36..bf7b1cf9 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -15,7 +15,7 @@ from modules.logging_colors import logger
def get_fallback_settings():
return {
'bf16': False,
- 'ctx_size': 2048,
+ 'ctx_size': 8192,
'rope_freq_base': 0,
'compress_pos_emb': 1,
'alpha_value': 1,
From b62c8845f34f3faac2481e368e6a4c67fd33fa59 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 08:22:17 -0700
Subject: [PATCH 33/79] mtmd: Fix /chat/completions for llama.cpp
---
extensions/openai/completions.py | 18 +++++++++++++++---
modules/chat.py | 21 +++++++++++----------
modules/llama_cpp_server.py | 8 ++++++--
3 files changed, 32 insertions(+), 15 deletions(-)
diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 6f4dfc29..c3037d0c 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -85,16 +85,28 @@ def process_parameters(body, is_legacy=False):
def process_multimodal_content(content):
- """Extract text from OpenAI multimodal format for non-multimodal models"""
+ """Extract text and add image placeholders from OpenAI multimodal format"""
if isinstance(content, str):
return content
if isinstance(content, list):
text_parts = []
+ image_placeholders = ""
for item in content:
- if isinstance(item, dict) and item.get('type') == 'text':
+ if not isinstance(item, dict):
+ continue
+
+ item_type = item.get('type', '')
+ if item_type == 'text':
text_parts.append(item.get('text', ''))
- return ' '.join(text_parts) if text_parts else str(content)
+ elif item_type == 'image_url':
+ image_placeholders += "<__media__>"
+
+ final_text = ' '.join(text_parts)
+ if image_placeholders:
+ return f"{image_placeholders}\n\n{final_text}"
+ else:
+ return final_text
return str(content)
diff --git a/modules/chat.py b/modules/chat.py
index 42bb58a5..7b1629dd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -870,18 +870,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
row_idx = len(output['internal']) - 1
- # Collect image attachments for multimodal generation
- image_attachments = []
+ # Collect image attachments for multimodal generation from the entire history
+ all_image_attachments = []
if 'metadata' in output:
- user_key = f"user_{row_idx}"
- if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
- for attachment in output['metadata'][user_key]["attachments"]:
- if attachment.get("type") == "image":
- image_attachments.append(attachment)
+ for i in range(len(output['internal'])):
+ user_key = f"user_{i}"
+ if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
+ for attachment in output['metadata'][user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ all_image_attachments.append(attachment)
- # Add image attachments to state for the generation
- if image_attachments:
- state['image_attachments'] = image_attachments
+ # Add all collected image attachments to state for the generation
+ if all_image_attachments:
+ state['image_attachments'] = all_image_attachments
# Generate the prompt
kwargs = {
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 3e8127ab..63c8eda0 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -15,6 +15,7 @@ import requests
from modules import shared
from modules.image_utils import (
convert_image_attachments_to_pil,
+ convert_openai_messages_to_images,
convert_pil_to_base64
)
from modules.logging_colors import logger
@@ -133,10 +134,13 @@ class LlamaServer:
payload = self.prepare_payload(state)
pil_images = []
- # Check for images from the Web UI (image_attachments)
+ # Source 1: Web UI (from chatbot_wrapper)
if 'image_attachments' in state and state['image_attachments']:
pil_images.extend(convert_image_attachments_to_pil(state['image_attachments']))
- # Else, check for images from the API (raw_images)
+ # Source 2: Chat Completions API (/v1/chat/completions)
+ elif 'history' in state and state.get('history', {}).get('messages'):
+ pil_images.extend(convert_openai_messages_to_images(state['history']['messages']))
+ # Source 3: Legacy Completions API (/v1/completions)
elif 'raw_images' in state and state['raw_images']:
pil_images.extend(state.get('raw_images', []))
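
After this patch, `process_multimodal_content` no longer drops images for the text prompt: it prepends one `<__media__>` placeholder per `image_url` item ahead of the joined text parts. The function as it reads after the patch, plus a small usage example (the image URL is illustrative):

```python
# One <__media__> placeholder per image_url item, prepended before the text.
def process_multimodal_content(content):
    if isinstance(content, str):
        return content

    if isinstance(content, list):
        text_parts = []
        image_placeholders = ""
        for item in content:
            if not isinstance(item, dict):
                continue

            item_type = item.get('type', '')
            if item_type == 'text':
                text_parts.append(item.get('text', ''))
            elif item_type == 'image_url':
                image_placeholders += "<__media__>"

        final_text = ' '.join(text_parts)
        return f"{image_placeholders}\n\n{final_text}" if image_placeholders else final_text

    return str(content)


content = [
    {"type": "text", "text": "What color is this image?"},
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
]
print(repr(process_multimodal_content(content)))
# -> '<__media__>\n\nWhat color is this image?'
```
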
From b10d525bf7618c415c88937e26c1a2240c3b2fcf Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 12:05:22 -0700
Subject: [PATCH 34/79] UI: Update a tooltip
---
js/main.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/js/main.js b/js/main.js
index e0f9314d..66a344b3 100644
--- a/js/main.js
+++ b/js/main.js
@@ -977,7 +977,7 @@ if (document.readyState === "loading") {
//------------------------------------------------
// File upload button
-document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, and DOCX documents";
+document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, DOCX documents, and images";
// Activate web search
document.getElementById("web-search").title = "Search the internet with DuckDuckGo";
From 1ba1211ca027b887f160f6894a004b7d96ea0eee Mon Sep 17 00:00:00 2001
From: Mykeehu
Date: Mon, 11 Aug 2025 21:13:56 +0200
Subject: [PATCH 35/79] Fix edit window and buttons in Messenger theme (#7100)
---
css/chat_style-messenger.css | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css
index 65af5f7a..583703c0 100644
--- a/css/chat_style-messenger.css
+++ b/css/chat_style-messenger.css
@@ -99,3 +99,9 @@
.message-body p em {
color: rgb(110 110 110) !important;
}
+.editing-textarea {
+ width: max(30rem) !important;
+}
+.circle-you + .text .edit-control-button, .circle-you + .text .editing-textarea {
+ color: #000 !important;
+}
From 999471256c0626bb29e9caa65bbf96b8d2cb52d6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 12:32:17 -0700
Subject: [PATCH 36/79] Lint
---
modules/exllamav2.py | 2 +-
modules/exllamav3.py | 11 ++++-------
2 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 5d5c5b56..3b3233d2 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -3,6 +3,7 @@ import traceback
from pathlib import Path
import torch
+
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
@@ -15,7 +16,6 @@ from exllamav2 import (
ExLlamaV2Tokenizer
)
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
-
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 3fabdb6b..980230f8 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -2,12 +2,9 @@ import traceback
from pathlib import Path
from typing import Any, List, Tuple
-import torch
from exllamav3 import Cache, Config, Generator, Model, Tokenizer
from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
from exllamav3.generator import Job
-
-from modules import shared
from exllamav3.generator.sampler import (
CustomSampler,
SS_Argmax,
@@ -19,13 +16,13 @@ from exllamav3.generator.sampler import (
SS_TopK,
SS_TopP
)
+from modules import shared
from modules.image_utils import (
convert_image_attachments_to_pil,
convert_openai_messages_to_images
)
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
-from modules.torch_utils import clear_torch_cache
try:
import flash_attn
@@ -205,13 +202,13 @@ class Exllamav3Model:
penalty_range = state['repetition_penalty_range']
if penalty_range <= 0:
penalty_range = int(10e7) # Use large number for "full context"
- rep_decay = 0 # Not a configurable parameter
+ rep_decay = 0 # Not a configurable parameter
# Add penalty samplers if they are active
if state['repetition_penalty'] != 1.0:
- unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay))
+ unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay))
if state['presence_penalty'] != 0.0 or state['frequency_penalty'] != 0.0:
- unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], state['frequency_penalty'], penalty_range, rep_decay))
+ unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], state['frequency_penalty'], penalty_range, rep_decay))
# Standard samplers
if state['top_k'] > 0:
From a78ca6ffcdf0c53efdce8bfa6b37825590f5ae6e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 12:33:38 -0700
Subject: [PATCH 37/79] Remove a comment
---
modules/text_generation.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index d6a87ce8..27c5de7d 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -295,8 +295,6 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
_StopEverythingStoppingCriteria
)
- # Native ExLlamav3Model handles multimodal internally - no special routing needed
-
if shared.args.loader == 'Transformers':
clear_torch_cache()
From 765af1ba1736b209427232d5bec1b2e55b099e1b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 12:39:18 -0700
Subject: [PATCH 38/79] API: Improve a validation
---
extensions/openai/typing.py | 17 ++++++-----------
1 file changed, 6 insertions(+), 11 deletions(-)
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 90366270..56d91582 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -2,7 +2,7 @@ import json
import time
from typing import Dict, List, Optional
-from pydantic import BaseModel, Field, field_validator, validator
+from pydantic import BaseModel, Field, model_validator, validator
class GenerationOptions(BaseModel):
@@ -116,16 +116,11 @@ class CompletionRequestParams(BaseModel):
top_p: float | None = 1
user: str | None = Field(default=None, description="Unused parameter.")
- @field_validator('prompt', 'messages')
- @classmethod
- def validate_prompt_or_messages(cls, v, info):
- """Ensure either 'prompt' or 'messages' is provided for completions."""
- if info.field_name == 'prompt': # If we're validating 'prompt', check if neither prompt nor messages will be set
- messages = info.data.get('messages')
- if v is None and messages is None:
- raise ValueError("Either 'prompt' or 'messages' must be provided")
-
- return v
+ @model_validator(mode='after')
+ def validate_prompt_or_messages(self):
+ if self.prompt is None and self.messages is None:
+ raise ValueError("Either 'prompt' or 'messages' must be provided")
+ return self
class CompletionRequest(GenerationOptions, CompletionRequestParams):
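
Moving the check into a `model_validator(mode='after')` means it runs once on the fully built model, so both `prompt` and `messages` are visible regardless of field order (the old `field_validator` could only see previously validated fields via `info.data`). A toy model illustrating the pattern; the real request class has many more fields:

```python
# Minimal sketch of the cross-field check with pydantic v2's model_validator.
from typing import List, Optional

from pydantic import BaseModel, ValidationError, model_validator


class ToyCompletionRequest(BaseModel):
    prompt: Optional[str] = None
    messages: Optional[List[dict]] = None

    @model_validator(mode='after')
    def validate_prompt_or_messages(self):
        if self.prompt is None and self.messages is None:
            raise ValueError("Either 'prompt' or 'messages' must be provided")
        return self


ToyCompletionRequest(prompt="Hello")                                 # accepted
ToyCompletionRequest(messages=[{"role": "user", "content": "Hi"}])   # accepted
try:
    ToyCompletionRequest()                                           # neither field -> rejected
except ValidationError as e:
    print(e.errors()[0]["msg"])
```
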
From 1e3c4e8bdbc3e8d313bfab016bc6f1853c4ad4b7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 14:40:59 -0700
Subject: [PATCH 39/79] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_cuda128.txt | 4 ++--
requirements/full/requirements_cuda128_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
18 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 323ef0f9..789539fc 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 2a7c9361..d7922478 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 0106fbea..2a3337a3 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index d5db4a1c..7287497d 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 694f1ff8..48ebe381 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 392637e2..ccf80d06 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 88eaa930..e819dd04 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
index 6accc2f0..8b9c882c 100644
--- a/requirements/full/requirements_cuda128.txt
+++ b/requirements/full/requirements_cuda128.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
index 3025f092..ce81c5ff 100644
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ b/requirements/full/requirements_cuda128_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 7394bdcf..6233b84a 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index a095a4c7..e3a863ec 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index ea43e56e..26f813d2 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 79737728..4de1159d 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index d39786bd..fded9898 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 0b373fa9..013364ff 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index fe9dccac..85e95eb3 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index b3cfd525..945dcf49 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 02aa03e3..bf1eff03 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From 0e88a621fd96bc75b908d078972ab8117e957f55 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 15:16:03 -0700
Subject: [PATCH 40/79] UI: Better organize the right sidebar
---
modules/ui_chat.py | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 3b922fb4..94c980bb 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -78,12 +78,19 @@ def create_ui():
with gr.Row():
shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
+ gr.HTML("")
+
shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.')
+
+ gr.HTML("")
+
shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search')
with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)
+ gr.HTML("")
+
with gr.Row():
shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
@@ -93,6 +100,8 @@ def create_ui():
with gr.Row():
shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])
+ gr.HTML("")
+
with gr.Row():
shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm')
From 0e3def449a8bf71ab40c052e4206f612aeba0a60 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 15:17:25 -0700
Subject: [PATCH 41/79] llama.cpp: Pass --swa-full to llama-server when
 streaming-llm is checked
---
modules/llama_cpp_server.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 63c8eda0..58534f26 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -356,6 +356,7 @@ class LlamaServer:
cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
if shared.args.streaming_llm:
cmd += ["--cache-reuse", "1"]
+ cmd += ["--swa-full"]
if shared.args.extra_flags:
# Clean up the input
extra_flags = shared.args.extra_flags.strip()
From c47e6deda279f27c7bff1a31351e72c0d5025052 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 16:20:20 -0700
Subject: [PATCH 42/79] Update README
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 907d8c38..6e59f7da 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
+- **Vision (multimodal models)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
- Aesthetic UI with dark and light themes.
- Syntax highlighting for code blocks and LaTeX rendering for mathematical expressions.
From e6447cd24acbde845dbb4aa27acfd4c17b5c849c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 17:42:35 -0700
Subject: [PATCH 43/79] mtmd: Update the llama-server request
---
modules/llama_cpp_server.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 58534f26..e82edb90 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -149,11 +149,10 @@ class LlamaServer:
IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image
base64_images = [convert_pil_to_base64(img) for img in pil_images]
- multimodal_prompt_object = {
- "prompt": prompt,
+ payload["prompt"] = {
+ "prompt_string": prompt,
"multimodal_data": base64_images
}
- payload["prompt"] = multimodal_prompt_object
# Calculate an estimated token count
text_tokens = self.encode(prompt, add_bos_token=state["add_bos_token"])
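
With this change the multimodal request nests the prompt text under `prompt_string` next to the base64 images under `multimodal_data`, assigned directly to `payload["prompt"]` instead of building a separate object first. A sketch of the resulting payload shape; `to_base64` stands in for `convert_pil_to_base64` from `modules.image_utils`, and the image and the extra payload field are illustrative:

```python
# Sketch of the multimodal payload shape sent to llama-server after this patch.
import base64
import io

from PIL import Image


def to_base64(img: Image.Image) -> str:
    # Stand-in for modules.image_utils.convert_pil_to_base64
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


pil_images = [Image.new("RGB", (8, 8), color="red")]  # example image
prompt = "<__media__>\n\nWhat color is this image?"

payload = {"n_predict": 64}  # other sampling fields omitted
payload["prompt"] = {
    "prompt_string": prompt,
    "multimodal_data": [to_base64(img) for img in pil_images],
}
print(list(payload["prompt"].keys()))  # -> ['prompt_string', 'multimodal_data']
```
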
From d8fcc71616307a8ecacea93b7bdfa1117a23e1fe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Aug 2025 18:02:33 -0700
Subject: [PATCH 44/79] mtmd: Fail early if images are provided but the model
doesn't support them (llama.cpp)
---
modules/llama_cpp_server.py | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index e82edb90..51dacb84 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -34,6 +34,7 @@ class LlamaServer:
self.process = None
self.session = requests.Session()
self.vocabulary_size = None
+ self.has_multimodal = False
self.bos_token = ""
self.last_prompt_token_count = 0
@@ -144,6 +145,10 @@ class LlamaServer:
elif 'raw_images' in state and state['raw_images']:
pil_images.extend(state.get('raw_images', []))
+ # Fail early if images are provided but the model doesn't support them
+ if pil_images and not self.has_multimodal:
+ raise RuntimeError("The loaded llama.cpp model does not support multimodal requests. You must load a vision model and provide an mmproj file.")
+
if pil_images:
# Multimodal case
IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image
@@ -261,8 +266,8 @@ class LlamaServer:
else:
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
- def _get_vocabulary_size(self):
- """Get and store the model's maximum context length."""
+ def _get_model_properties(self):
+ """Get and store the model's properties, including vocab size and multimodal capability."""
url = f"http://127.0.0.1:{self.port}/v1/models"
response = self.session.get(url).json()
@@ -271,6 +276,10 @@ class LlamaServer:
if "meta" in model_info and "n_vocab" in model_info["meta"]:
self.vocabulary_size = model_info["meta"]["n_vocab"]
+ # Check for multimodal capability
+ if "capabilities" in model_info and "multimodal" in model_info["capabilities"]:
+ self.has_multimodal = True
+
def _get_bos_token(self):
"""Get and store the model's BOS token."""
url = f"http://127.0.0.1:{self.port}/props"
@@ -421,7 +430,7 @@ class LlamaServer:
time.sleep(1)
# Server is now healthy, get model info
- self._get_vocabulary_size()
+ self._get_model_properties()
self._get_bos_token()
return self.port
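
The `/v1/models` response is now also used to detect multimodal support, so generation can fail with a clear error before any request is sent when images are attached to a text-only model. A minimal sketch of that check; the sample response data is made up, and the assumption that `model_info` comes from the first entry of an OpenAI-style `data` list is noted in the comments, since that part sits outside the hunk shown above:

```python
# Sketch of the capability check added in this patch, run against an
# illustrative /v1/models response.
def parse_model_properties(response: dict):
    vocabulary_size = None
    has_multimodal = False
    model_info = response["data"][0]  # assumed: first entry of a "data" list
    if "meta" in model_info and "n_vocab" in model_info["meta"]:
        vocabulary_size = model_info["meta"]["n_vocab"]
    if "capabilities" in model_info and "multimodal" in model_info["capabilities"]:
        has_multimodal = True
    return vocabulary_size, has_multimodal


response = {"data": [{"meta": {"n_vocab": 128256}, "capabilities": ["completion"]}]}
vocab_size, has_multimodal = parse_model_properties(response)

pil_images = ["<image placeholder>"]  # pretend the request carries an image
try:
    if pil_images and not has_multimodal:
        raise RuntimeError(
            "The loaded llama.cpp model does not support multimodal requests. "
            "You must load a vision model and provide an mmproj file."
        )
except RuntimeError as e:
    print(e)
```
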
From 0882970a9445badcd953f27e4e10ecf869c103a5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 07:00:24 -0700
Subject: [PATCH 45/79] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_cuda128.txt | 4 ++--
requirements/full/requirements_cuda128_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
18 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 789539fc..eb7742b1 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index d7922478..47bcb60a 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 2a3337a3..6958ce37 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 7287497d..0890b2a5 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 48ebe381..da3010c6 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index ccf80d06..3a9a953b 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index e819dd04..a3e176d3 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
index 8b9c882c..807d0a21 100644
--- a/requirements/full/requirements_cuda128.txt
+++ b/requirements/full/requirements_cuda128.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
index ce81c5ff..41e96574 100644
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ b/requirements/full/requirements_cuda128_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 6233b84a..72ba7103 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index e3a863ec..0c7f1d29 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 26f813d2..09f1c502 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 4de1159d..75296cb4 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index fded9898..ff3d7cb1 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 013364ff..97414bde 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 85e95eb3..7f543205 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 945dcf49..c1764ead 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index bf1eff03..142b67ec 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From 2238302b496a4145ee98e0eab0bf3d9f19a9c83b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 08:50:45 -0700
Subject: [PATCH 46/79] ExLlamaV3: Add speculative decoding
---
modules/exllamav3.py | 58 ++++++++++++++++++++++++++++++++++++++++++++
modules/loaders.py | 4 +++
2 files changed, 62 insertions(+)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 980230f8..7fc6c5b1 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -85,6 +85,7 @@ class Exllamav3Model:
cache = Cache(model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs)
load_params = {'progressbar': True}
+ split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
load_params['use_per_device'] = split
@@ -92,6 +93,45 @@ class Exllamav3Model:
model.load(**load_params)
tokenizer = Tokenizer.from_config(config)
+ # Initialize draft model for speculative decoding
+ draft_model = None
+ draft_cache = None
+ if shared.args.model_draft and shared.args.model_draft.lower() not in ["", "none"]:
+ logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}")
+
+ draft_path = Path(shared.args.model_draft)
+ if not draft_path.is_dir():
+ draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft)
+
+ if not draft_path.is_dir():
+ logger.warning(f"Draft model not found at {draft_path}, speculative decoding disabled.")
+ else:
+ draft_config = Config.from_directory(str(draft_path))
+
+ # Set context size for draft model with 256-multiple validation
+ if shared.args.ctx_size_draft > 0:
+ draft_max_tokens = shared.args.ctx_size_draft
+ else:
+ draft_max_tokens = shared.args.ctx_size
+
+ # Validate draft model context size is a multiple of 256
+ if draft_max_tokens % 256 != 0:
+ adjusted_draft_tokens = ((draft_max_tokens // 256) + 1) * 256
+ logger.warning(f"Draft model max_num_tokens must be a multiple of 256. Adjusting from {draft_max_tokens} to {adjusted_draft_tokens}")
+ draft_max_tokens = adjusted_draft_tokens
+
+ draft_config.max_seq_len = draft_max_tokens
+
+ draft_model = Model.from_config(draft_config)
+ draft_cache = Cache(draft_model, max_num_tokens=draft_max_tokens, layer_type=layer_type, **cache_kwargs)
+
+ draft_load_params = {'progressbar': True}
+ if split:
+ draft_load_params['use_per_device'] = split
+
+ draft_model.load(**draft_load_params)
+ logger.info(f"Draft model loaded successfully. Max speculative tokens: {shared.args.draft_max}")
+
# Load vision model component (ExLlamaV3 native)
vision_model = None
if "vision_config" in config.config_dict:
@@ -109,6 +149,9 @@ class Exllamav3Model:
model=model,
cache=cache,
tokenizer=tokenizer,
+ draft_model=draft_model,
+ draft_cache=draft_cache,
+ num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0,
)
result = cls()
@@ -119,6 +162,8 @@ class Exllamav3Model:
result.config = config
result.max_tokens = max_tokens
result.vision_model = vision_model
+ result.draft_model = draft_model
+ result.draft_cache = draft_cache
return result
@@ -289,6 +334,7 @@ class Exllamav3Model:
self.generator.enqueue(job)
response_text = ""
+
try:
while self.generator.num_remaining_jobs():
results = self.generator.iterate()
@@ -300,6 +346,7 @@ class Exllamav3Model:
if chunk:
response_text += chunk
yield response_text
+
finally:
self.generator.clear_queue()
@@ -331,6 +378,17 @@ class Exllamav3Model:
logger.warning(f"Error unloading vision model: {e}")
self.vision_model = None
+ if hasattr(self, 'draft_model') and self.draft_model is not None:
+ try:
+ self.draft_model.unload()
+ del self.draft_model
+ except Exception as e:
+ logger.warning(f"Error unloading draft model: {e}")
+ self.draft_model = None
+
+ if hasattr(self, 'draft_cache') and self.draft_cache is not None:
+ self.draft_cache = None
+
if hasattr(self, 'model') and self.model is not None:
try:
self.model.unload()
diff --git a/modules/loaders.py b/modules/loaders.py
index feca9985..8b7e6cce 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -61,6 +61,10 @@ loaders_and_params = OrderedDict({
'ctx_size',
'cache_type',
'gpu_split',
+ 'model_draft',
+ 'draft_max',
+ 'ctx_size_draft',
+ 'speculative_decoding_accordion',
],
'ExLlamav2_HF': [
'ctx_size',
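For reference, the 256-multiple adjustment in the draft-cache setup above rounds any non-multiple context size up to the next multiple of 256. A minimal sketch of that arithmetic (the helper name is illustrative, not part of the patch):

```python
def round_up_to_256(value: int) -> int:
    """Round a context size up to the next multiple of 256 (mirrors the patch logic)."""
    if value % 256 != 0:
        return ((value // 256) + 1) * 256
    return value


print(round_up_to_256(4000))   # 4096 (adjusted up, with a warning in the patch)
print(round_up_to_256(8192))   # 8192 (already a multiple, left unchanged)
```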
From 2f6a629393afdb33e7fd355be10f6c72185412af Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 08:51:01 -0700
Subject: [PATCH 47/79] UI: Minor improvement after
0e88a621fd96bc75b908d078972ab8117e957f55
---
js/main.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/js/main.js b/js/main.js
index 66a344b3..4b4b14c2 100644
--- a/js/main.js
+++ b/js/main.js
@@ -583,7 +583,7 @@ function moveToChatTab() {
const chatControlsFirstChild = document.querySelector("#chat-controls").firstElementChild;
const newParent = chatControlsFirstChild;
- let newPosition = newParent.children.length - 2;
+ let newPosition = newParent.children.length - 3;
newParent.insertBefore(grandParent, newParent.children[newPosition]);
document.getElementById("save-character").style.display = "none";
From 8d7b88106a34102863a491a9c8848871c5118a85 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:20:16 -0700
Subject: [PATCH 48/79] Revert "mtmd: Fail early if images are provided but the
model doesn't support them (llama.cpp)"
This reverts commit d8fcc71616307a8ecacea93b7bdfa1117a23e1fe.
---
modules/llama_cpp_server.py | 15 +++------------
1 file changed, 3 insertions(+), 12 deletions(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 51dacb84..e82edb90 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -34,7 +34,6 @@ class LlamaServer:
self.process = None
self.session = requests.Session()
self.vocabulary_size = None
- self.has_multimodal = False
self.bos_token = ""
self.last_prompt_token_count = 0
@@ -145,10 +144,6 @@ class LlamaServer:
elif 'raw_images' in state and state['raw_images']:
pil_images.extend(state.get('raw_images', []))
- # Fail early if images are provided but the model doesn't support them
- if pil_images and not self.has_multimodal:
- raise RuntimeError("The loaded llama.cpp model does not support multimodal requests. You must load a vision model and provide an mmproj file.")
-
if pil_images:
# Multimodal case
IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image
@@ -266,8 +261,8 @@ class LlamaServer:
else:
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
- def _get_model_properties(self):
- """Get and store the model's properties, including vocab size and multimodal capability."""
+ def _get_vocabulary_size(self):
+        """Get and store the model's vocabulary size."""
url = f"http://127.0.0.1:{self.port}/v1/models"
response = self.session.get(url).json()
@@ -276,10 +271,6 @@ class LlamaServer:
if "meta" in model_info and "n_vocab" in model_info["meta"]:
self.vocabulary_size = model_info["meta"]["n_vocab"]
- # Check for multimodal capability
- if "capabilities" in model_info and "multimodal" in model_info["capabilities"]:
- self.has_multimodal = True
-
def _get_bos_token(self):
"""Get and store the model's BOS token."""
url = f"http://127.0.0.1:{self.port}/props"
@@ -430,7 +421,7 @@ class LlamaServer:
time.sleep(1)
# Server is now healthy, get model info
- self._get_model_properties()
+ self._get_vocabulary_size()
self._get_bos_token()
return self.port
From 7301452b4183efab97de71dae27486874a3d73f6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:23:24 -0700
Subject: [PATCH 49/79] UI: Minor info message change
---
modules/ui_model_menu.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 9fa8a4f4..6972a17e 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -42,7 +42,7 @@ def create_ui():
with gr.Row():
with gr.Column():
shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
- shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.')
+ shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
From 2f979ce2942efc82ad90dfc28c7407c473da5169 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:33:49 -0700
Subject: [PATCH 50/79] docs: Add a multimodal tutorial
---
docs/Multimodal Tutorial.md | 66 +++++++++++++++++++++++++++++++++++++
1 file changed, 66 insertions(+)
create mode 100644 docs/Multimodal Tutorial.md
diff --git a/docs/Multimodal Tutorial.md b/docs/Multimodal Tutorial.md
new file mode 100644
index 00000000..a30889f7
--- /dev/null
+++ b/docs/Multimodal Tutorial.md
@@ -0,0 +1,66 @@
+## Getting started
+
+### 1. Find a multimodal model
+
+GGUF models with vision capabilities are uploaded to Hugging Face alongside an `mmproj` file.
+
+For instance, [unsloth/gemma-3-4b-it-GGUF](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/tree/main) has this:
+
+
+
+### 2. Download the model to `user_data/models`
+
+As an example, download
+
+https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/gemma-3-4b-it-Q4_K_S.gguf?download=true
+
+to your `text-generation-webui/user_data/models` folder.
+
+### 3. Download the associated mmproj file to `user_data/mmproj`
+
+Then download
+
+https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/mmproj-F16.gguf?download=true
+
+to your `text-generation-webui/user_data/mmproj` folder. Rename it to `mmproj-gemma-3-4b-it-F16.gguf` so it has a recognizable name.
+
+### 4. Load the model
+
+1. Launch the web UI
+2. Navigate to the Model tab
+3. Select the GGUF model in the Model dropdown:
+
+
+
+4. Select the mmproj file in the Multimodal (vision) menu:
+
+
+
+5. Click "Load"
+
+### 5. Send a message with an image
+
+Select your image by clicking on the 📎 icon and send your message:
+
+
+
+The model will reply with a good understanding of the image contents:
+
+
+
+## Multimodal with ExLlamaV3
+
+Multimodal also works with the ExLlamaV3 loader (the non-HF one).
+
+No additional files are necessary; just load a multimodal EXL3 model and send an image.
+
+Examples of models that you can use:
+
+- https://huggingface.co/turboderp/gemma-3-27b-it-exl3
+- https://huggingface.co/turboderp/Mistral-Small-3.1-24B-Instruct-2503-exl3
+
+## Multimodal API examples
+
+On the page below, you can find some ready-to-use examples:
+
+[Multimodal/vision (llama.cpp and ExLlamaV3)](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#multimodalvision-llamacpp-and-exllamav3)
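For completeness, steps 2 and 3 of the tutorial above can also be scripted. A minimal sketch using the `huggingface_hub` client (assumes `pip install huggingface_hub`; run it from the `text-generation-webui` directory):

```python
from pathlib import Path

from huggingface_hub import hf_hub_download

# Step 2: download the quantized GGUF model into user_data/models
hf_hub_download(
    repo_id="unsloth/gemma-3-4b-it-GGUF",
    filename="gemma-3-4b-it-Q4_K_S.gguf",
    local_dir="user_data/models",
)

# Step 3: download the mmproj file into user_data/mmproj and give it a recognizable name
mmproj_path = hf_hub_download(
    repo_id="unsloth/gemma-3-4b-it-GGUF",
    filename="mmproj-F16.gguf",
    local_dir="user_data/mmproj",
)
Path(mmproj_path).rename("user_data/mmproj/mmproj-gemma-3-4b-it-F16.gguf")
```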
From 41b95e9ec3dada8a931abb1a1ca974529d12d177 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:37:37 -0700
Subject: [PATCH 51/79] Lint
---
modules/exllamav2.py | 2 +-
modules/exllamav3.py | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 3b3233d2..5d5c5b56 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -3,7 +3,6 @@ import traceback
from pathlib import Path
import torch
-
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
@@ -16,6 +15,7 @@ from exllamav2 import (
ExLlamaV2Tokenizer
)
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
+
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 7fc6c5b1..66e25693 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -16,6 +16,7 @@ from exllamav3.generator.sampler import (
SS_TopK,
SS_TopP
)
+
from modules import shared
from modules.image_utils import (
convert_image_attachments_to_pil,
From bd05fb899e7bb2889d616970e39f8cca41541e79 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 14:19:18 -0700
Subject: [PATCH 52/79] Update README
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 6e59f7da..5e7f37de 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
-- **Vision (multimodal models)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal%E2Tutorial)).
+- **Vision (multimodal models)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
- Aesthetic UI with dark and light themes.
- Syntax highlighting for code blocks and LaTeX rendering for mathematical expressions.
From 331eab81f785b08eabc41e320f59c45c42a7d73f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 13 Aug 2025 06:44:34 -0700
Subject: [PATCH 53/79] mtmd: Explain base64 inputs in the API docs
---
docs/12 - OpenAI API.md | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index fd3309c7..f85991f4 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -97,6 +97,8 @@ curl http://127.0.0.1:5000/v1/chat/completions \
}'
```
+For base64-encoded images, just replace the inner "url" value with the base64 string, formatted as `data:image/FORMAT;base64,BASE64_STRING` where FORMAT is the image type (image/png, image/jpeg, image/gif, etc.) and BASE64_STRING is your encoded image.
+
##### With /v1/completions
```shell
@@ -129,6 +131,8 @@ curl http://127.0.0.1:5000/v1/completions \
}'
```
+For base64-encoded images, just replace the inner "url" values with the base64 strings, formatted as `data:image/FORMAT;base64,BASE64_STRING` where FORMAT is the image type (image/png, image/jpeg, image/gif, etc.) and BASE64_STRING is your encoded image.
+
#### SSE streaming
```shell
From 725a8bcf60dd769aa2d62760750c74b772f9f504 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 13 Aug 2025 06:49:28 -0700
Subject: [PATCH 54/79] Small docs change
---
docs/12 - OpenAI API.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index f85991f4..83bba41f 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -97,7 +97,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \
}'
```
-For base64-encoded images, just replace the inner "url" value with the base64 string, formatted as `data:image/FORMAT;base64,BASE64_STRING` where FORMAT is the image type (image/png, image/jpeg, image/gif, etc.) and BASE64_STRING is your encoded image.
+For base64-encoded images, just replace the inner "url" value with this format: `data:image/FORMAT;base64,BASE64_STRING` where FORMAT is the file type (png, jpeg, gif, etc.) and BASE64_STRING is your base64-encoded image data.
##### With /v1/completions
@@ -131,7 +131,7 @@ curl http://127.0.0.1:5000/v1/completions \
}'
```
-For base64-encoded images, just replace the inner "url" values with the base64 strings, formatted as `data:image/FORMAT;base64,BASE64_STRING` where FORMAT is the image type (image/png, image/jpeg, image/gif, etc.) and BASE64_STRING is your encoded image.
+For base64-encoded images, just replace the inner "url" values with this format: `data:image/FORMAT;base64,BASE64_STRING` where FORMAT is the file type (png, jpeg, gif, etc.) and BASE64_STRING is your base64-encoded image data.
#### SSE streaming
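As a complement to the base64 explanation above, a minimal Python sketch for building such a data URI from a local file (the file name is only an example):

```python
import base64
from pathlib import Path

image_bytes = Path("cat.png").read_bytes()
b64_string = base64.b64encode(image_bytes).decode("utf-8")

# Use this as the inner "url" value in the chat/completions payload
data_uri = f"data:image/png;base64,{b64_string}"
print(data_uri[:60] + "...")
```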
From 57f6e9af5a8defd67959672e8dc92040be91a0a5 Mon Sep 17 00:00:00 2001
From: altoiddealer
Date: Wed, 13 Aug 2025 15:47:27 -0400
Subject: [PATCH 55/79] Set multimodal status during Model Loading (#7199)
---
modules/exllamav3.py | 10 +++++-----
modules/llama_cpp_server.py | 24 ++++++++++++++++++++----
modules/models.py | 4 ++++
modules/shared.py | 1 +
4 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 66e25693..e580bbda 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -177,9 +177,6 @@ class Exllamav3Model:
Process all possible image inputs and return modified prompt + embeddings.
Returns: (processed_prompt, image_embeddings)
"""
- if not self.is_multimodal():
- return prompt, []
-
# Collect images from various sources using shared utilities
pil_images = []
@@ -234,8 +231,11 @@ class Exllamav3Model:
"""
Generate text with streaming using native ExLlamaV3 API
"""
- # Process images and modify prompt (ExLlamaV3-specific)
- prompt, image_embeddings = self._process_images_for_generation(prompt, state)
+ image_embeddings = []
+
+ if shared.is_multimodal:
+ # Process images and modify prompt (ExLlamaV3-specific)
+ prompt, image_embeddings = self._process_images_for_generation(prompt, state)
# Greedy decoding is a special case
if state['temperature'] == 0:
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index e82edb90..5953803a 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -8,6 +8,7 @@ import sys
import threading
import time
from pathlib import Path
+from typing import Any, List
import llama_cpp_binaries
import requests
@@ -129,10 +130,10 @@ class LlamaServer:
return payload
- def generate_with_streaming(self, prompt, state):
- url = f"http://127.0.0.1:{self.port}/completion"
- payload = self.prepare_payload(state)
-
+ def _process_images_for_generation(self, state: dict) -> List[Any]:
+ """
+ Process all possible image inputs and return PIL images
+ """
pil_images = []
# Source 1: Web UI (from chatbot_wrapper)
if 'image_attachments' in state and state['image_attachments']:
@@ -144,6 +145,21 @@ class LlamaServer:
elif 'raw_images' in state and state['raw_images']:
pil_images.extend(state.get('raw_images', []))
+ return pil_images
+
+ def is_multimodal(self) -> bool:
+ """Check if this model supports multimodal input."""
+ return shared.args.mmproj not in [None, 'None']
+
+ def generate_with_streaming(self, prompt, state):
+ url = f"http://127.0.0.1:{self.port}/completion"
+ payload = self.prepare_payload(state)
+
+ pil_images = []
+
+ if shared.is_multimodal:
+ pil_images = self._process_images_for_generation(state)
+
if pil_images:
# Multimodal case
IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image
diff --git a/modules/models.py b/modules/models.py
index cc500a40..938eed3d 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -55,6 +55,10 @@ def load_model(model_name, loader=None):
if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
shared.settings['truncation_length'] = shared.args.ctx_size
+ shared.is_multimodal = False
+ if loader.lower() in ('exllamav3', 'llama.cpp'):
+ shared.is_multimodal = model.is_multimodal()
+
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
logger.info(f"LOADER: \"{loader}\"")
logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
diff --git a/modules/shared.py b/modules/shared.py
index e9d8a62f..a1f4571e 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -16,6 +16,7 @@ model = None
tokenizer = None
model_name = 'None'
is_seq2seq = False
+is_multimodal = False
model_dirty_from_training = False
lora_names = []
From 73a8a737b23fc195c52ef1d9021993fd13e28e33 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 13 Aug 2025 18:23:18 -0700
Subject: [PATCH 56/79] docs: Improve the multimodal examples slightly
---
docs/12 - OpenAI API.md | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 83bba41f..227541a3 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -93,7 +93,10 @@ curl http://127.0.0.1:5000/v1/chat/completions \
{"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}}
]
}
- ]
+ ],
+ "temperature": 0.6,
+ "top_p": 0.95,
+ "top_k": 20
}'
```
@@ -127,7 +130,10 @@ curl http://127.0.0.1:5000/v1/completions \
}
]
}
- ]
+ ],
+ "temperature": 0.6,
+ "top_p": 0.95,
+ "top_k": 20
}'
```
From d771ca4a13b9837e169cd44815bb3a86bc6c8a4b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 14 Aug 2025 12:02:30 -0700
Subject: [PATCH 57/79] Fix web search (attempt)
---
modules/web_search.py | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/modules/web_search.py b/modules/web_search.py
index 3b1f6e18..597af4b2 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -1,6 +1,8 @@
import concurrent.futures
import html
+import random
import re
+import urllib.request
from concurrent.futures import as_completed
from datetime import datetime
from urllib.parse import quote_plus
@@ -50,16 +52,21 @@ def download_web_page(url, timeout=10):
def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
"""Perform web search and return results with content"""
try:
- # Use DuckDuckGo HTML search endpoint
search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
- headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
- response = requests.get(search_url, headers=headers, timeout=timeout)
- response.raise_for_status()
+ agents = [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+ ]
+
+ response_text = ""
+ req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
+ with urllib.request.urlopen(req, timeout=timeout) as response:
+ response_text = response.read().decode('utf-8')
# Extract results with regex
-        titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response.text, re.DOTALL)
-        urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response.text, re.DOTALL)
+        titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+        urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
# Prepare download tasks
download_tasks = []
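As a quick sanity check on the regex extraction above, a self-contained sketch (the HTML snippet is invented for illustration and is much simpler than DuckDuckGo's real markup):

```python
import re

html = '<a rel="nofollow" class="result__a" href="https://example.com">Example result title</a>'

titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', html, re.DOTALL)
print(titles)  # ['Example result title']
```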
From dbabe67e776d46bb0a84987a9c484a59bd75d8db Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 13:19:11 -0700
Subject: [PATCH 58/79] ExLlamaV3: Enable the --enable-tp option, add a
--tp-backend option
---
modules/exllamav3.py | 5 +++++
modules/loaders.py | 2 ++
modules/shared.py | 6 +++++-
modules/ui.py | 1 +
modules/ui_model_menu.py | 4 +++-
5 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index e580bbda..73962977 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -91,6 +91,11 @@ class Exllamav3Model:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
load_params['use_per_device'] = split
+ # Tensor-parallelism
+ if shared.args.enable_tp:
+ load_params['tensor_p'] = True
+ load_params['tp_backend'] = shared.args.tp_backend
+
model.load(**load_params)
tokenizer = Tokenizer.from_config(config)
diff --git a/modules/loaders.py b/modules/loaders.py
index 8b7e6cce..295db1e7 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -65,6 +65,8 @@ loaders_and_params = OrderedDict({
'draft_max',
'ctx_size_draft',
'speculative_decoding_accordion',
+ 'enable_tp',
+ 'tp_backend',
],
'ExLlamav2_HF': [
'ctx_size',
diff --git a/modules/shared.py b/modules/shared.py
index a1f4571e..644261a0 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -101,6 +101,11 @@ group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of
group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
+# ExLlamaV3
+group = parser.add_argument_group('ExLlamaV3')
+group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) to split the model across GPUs.')
+group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.')
+
# ExLlamaV2
group = parser.add_argument_group('ExLlamaV2')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
@@ -110,7 +115,6 @@ group.add_argument('--no_flash_attn', action='store_true', help='Force flash-att
group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
-group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
# TensorRT-LLM
group = parser.add_argument_group('TensorRT-LLM')
diff --git a/modules/ui.py b/modules/ui.py
index 1171cd48..502005e7 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -155,6 +155,7 @@ def list_model_elements():
'bf16',
'autosplit',
'enable_tp',
+ 'tp_backend',
'no_flash_attn',
'no_xformers',
'no_sdpa',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 6972a17e..dd240627 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -46,6 +46,8 @@ def create_ui():
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
+ shared.gradio['tp_backend'] = gr.Dropdown(label="tp-backend", choices=['native', 'nccl'], value=shared.args.tp_backend, info='The backend for tensor parallelism.')
+
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
@@ -54,7 +56,7 @@ def create_ui():
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
- shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
+ shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
From a633793a0007d435ed4f2cf08f0fbb4b77651b91 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 13:19:42 -0700
Subject: [PATCH 59/79] Bump exllamav3 to 0.0.6
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 2 +-
requirements/full/requirements_apple_silicon.txt | 2 +-
requirements/full/requirements_cuda128.txt | 4 ++--
requirements/full/requirements_cuda128_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
6 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index eb7742b1..d0282ee9 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -36,8 +36,8 @@ tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 0890b2a5..13361a78 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -35,5 +35,5 @@ tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index da3010c6..e4a30168 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -36,5 +36,5 @@ tiktoken
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
index 807d0a21..eeee9ff3 100644
--- a/requirements/full/requirements_cuda128.txt
+++ b/requirements/full/requirements_cuda128.txt
@@ -36,8 +36,8 @@ tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
index 41e96574..d08f23ca 100644
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ b/requirements/full/requirements_cuda128_noavx2.txt
@@ -36,8 +36,8 @@ tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 72ba7103..71fd227d 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -36,8 +36,8 @@ tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
From 9651b5c873649e5c967142f9f78e7ad6cf59aaf5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 13:22:55 -0700
Subject: [PATCH 60/79] Make CUDA 12.8 the default CUDA option, remove the CUDA
12.4 option
Exllamav3 doesn't compile with torch 2.6 anymore, and torch 2.7
requires newer CUDA.
---
one_click.py | 20 +++++++--
requirements/full/requirements.txt | 14 +++---
requirements/full/requirements_cuda128.txt | 45 -------------------
.../full/requirements_cuda128_noavx2.txt | 45 -------------------
requirements/full/requirements_noavx2.txt | 14 +++---
5 files changed, 30 insertions(+), 108 deletions(-)
delete mode 100644 requirements/full/requirements_cuda128.txt
delete mode 100644 requirements/full/requirements_cuda128_noavx2.txt
diff --git a/one_click.py b/one_click.py
index 050da76b..486e893e 100644
--- a/one_click.py
+++ b/one_click.py
@@ -16,7 +16,7 @@ import sys
# os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030'
# Define the required versions
-TORCH_VERSION = "2.6.0"
+TORCH_VERSION = "2.7.0"
PYTHON_VERSION = "3.11"
LIBSTDCXX_VERSION_LINUX = "12.1.0"
@@ -113,17 +113,16 @@ def get_gpu_choice():
choice = get_user_choice(
"What is your GPU?",
{
- 'A': 'NVIDIA - CUDA 12.4',
+ 'A': 'NVIDIA',
'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4',
'C': 'Apple M Series',
'D': 'Intel Arc (beta)',
- 'E': 'NVIDIA - CUDA 12.8',
'N': 'CPU mode'
},
)
# Convert choice to GPU name
- gpu_choice = {"A": "NVIDIA", "B": "AMD", "C": "APPLE", "D": "INTEL", "E": "NVIDIA_CUDA128", "N": "NONE"}[choice]
+ gpu_choice = {"A": "NVIDIA_CUDA128", "B": "AMD", "C": "APPLE", "D": "INTEL", "N": "NONE"}[choice]
# Save choice to state
state['gpu_choice'] = gpu_choice
@@ -368,6 +367,19 @@ def update_requirements(initial_installation=False, pull=True):
assert_success=True
)
+ # Check for outdated CUDA 12.4 installs and refuse to update
+ state = load_state()
+ if state.get('gpu_choice') == 'NVIDIA':
+ print_big_message(
+ "Your current installation uses CUDA 12.4, which has been removed.\n"
+ "To update to the new default (CUDA 12.8), a clean installation is required.\n\n"
+ "INSTRUCTIONS:\n"
+ "1. Delete the 'installer_files' folder in your text-generation-webui directory.\n"
+ "2. Run the start script again (e.g., start_windows.bat).\n\n"
+ "This will create a fresh environment with the latest software."
+ )
+ sys.exit(0)
+
current_commit = get_current_commit()
wheels_changed = not os.path.exists(state_file)
if not wheels_changed:
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index d0282ee9..eeee9ff3 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -24,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.55.*
-triton-windows==3.2.0.post19; platform_system == "Windows"
+triton-windows==3.3.1.post19; platform_system == "Windows"
tqdm
wandb
@@ -36,10 +36,10 @@ tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
deleted file mode 100644
index eeee9ff3..00000000
--- a/requirements/full/requirements_cuda128.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-accelerate==1.8.*
-bitsandbytes==0.46.*
-colorama
-datasets
-einops
-fastapi==0.112.4
-gradio==4.37.*
-html2text==2025.4.15
-jinja2==3.1.6
-markdown
-numpy==2.2.*
-pandas
-peft==0.16.*
-Pillow>=9.5.0
-psutil
-pydantic==2.8.2
-PyPDF2==3.0.1
-python-docx==1.1.2
-pyyaml
-requests
-rich
-safetensors==0.5.*
-scipy
-sentencepiece
-tensorboard
-transformers==4.55.*
-triton-windows==3.3.1.post19; platform_system == "Windows"
-tqdm
-wandb
-
-# API
-flask_cloudflared==0.0.14
-sse-starlette==1.6.5
-tiktoken
-
-# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
deleted file mode 100644
index d08f23ca..00000000
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-accelerate==1.8.*
-bitsandbytes==0.46.*
-colorama
-datasets
-einops
-fastapi==0.112.4
-gradio==4.37.*
-html2text==2025.4.15
-jinja2==3.1.6
-markdown
-numpy==2.2.*
-pandas
-peft==0.16.*
-Pillow>=9.5.0
-psutil
-pydantic==2.8.2
-PyPDF2==3.0.1
-python-docx==1.1.2
-pyyaml
-requests
-rich
-safetensors==0.5.*
-scipy
-sentencepiece
-tensorboard
-transformers==4.55.*
-triton-windows==3.3.1.post19; platform_system == "Windows"
-tqdm
-wandb
-
-# API
-flask_cloudflared==0.0.14
-sse-starlette==1.6.5
-tiktoken
-
-# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 71fd227d..d08f23ca 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -24,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.55.*
-triton-windows==3.2.0.post19; platform_system == "Windows"
+triton-windows==3.3.1.post19; platform_system == "Windows"
tqdm
wandb
@@ -36,10 +36,10 @@ tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
From 3a91ca2dd191716be9e9f3f20627c1e1a80d13f1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 13:57:23 -0700
Subject: [PATCH 61/79] Update flash attention
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index eeee9ff3..d57a457c 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -41,5 +41,5 @@ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.7cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index d08f23ca..b073a3a9 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -41,5 +41,5 @@ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.7cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
From 64eba9576cb806d2213b7efbb82469aa70a9fd71 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 14:08:40 -0700
Subject: [PATCH 62/79] mtmd: Fix a bug when "include past attachments" is
unchecked
---
modules/chat.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/modules/chat.py b/modules/chat.py
index 7b1629dd..ab6b43c0 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -269,7 +269,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
enhanced_user_msg = user_msg
# Add attachment content if present AND if past attachments are enabled
- if (state.get('include_past_attachments', True) and user_key in metadata and "attachments" in metadata[user_key]):
+ if user_key in metadata and "attachments" in metadata[user_key]:
attachments_text = ""
image_refs = ""
@@ -277,7 +277,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
if attachment.get("type") == "image":
# Add image reference for multimodal models
image_refs += "<__media__>"
- else:
+ elif state.get('include_past_attachments', True):
# Handle text/PDF attachments
filename = attachment.get("name", "file")
content = attachment.get("content", "")
From 58797a9eb5f386cc8262f6d8f1a152494249c28d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 14:17:20 -0700
Subject: [PATCH 63/79] Minor change after
9651b5c873649e5c967142f9f78e7ad6cf59aaf5
---
one_click.py | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/one_click.py b/one_click.py
index 486e893e..67be9e4b 100644
--- a/one_click.py
+++ b/one_click.py
@@ -135,9 +135,7 @@ def get_pytorch_install_command(gpu_choice):
"""Get PyTorch installation command based on GPU choice"""
base_cmd = f"python -m pip install torch=={TORCH_VERSION} "
- if gpu_choice == "NVIDIA":
- return base_cmd + "--index-url https://download.pytorch.org/whl/cu124"
- elif gpu_choice == "NVIDIA_CUDA128":
+ if gpu_choice == "NVIDIA_CUDA128":
return "python -m pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128"
elif gpu_choice == "AMD":
return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.2.4"
@@ -156,9 +154,7 @@ def get_pytorch_update_command(gpu_choice):
"""Get PyTorch update command based on GPU choice"""
base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} "
- if gpu_choice == "NVIDIA":
- return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
- elif gpu_choice == "NVIDIA_CUDA128":
+ if gpu_choice == "NVIDIA_CUDA128":
return "python -m pip install --upgrade torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128"
elif gpu_choice == "AMD":
return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
@@ -181,8 +177,6 @@ def get_requirements_file(gpu_choice):
file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt"
elif gpu_choice in ["INTEL", "NONE"]:
file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt"
- elif gpu_choice == "NVIDIA":
- file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt"
elif gpu_choice == "NVIDIA_CUDA128":
file_name = f"requirements_cuda128{'_noavx2' if not cpu_has_avx2() else ''}.txt"
else:
@@ -330,8 +324,6 @@ def install_webui():
cmd_flags_file.write("\n--cpu\n")
# Handle CUDA version display
- elif any((is_windows(), is_linux())) and gpu_choice == "NVIDIA":
- print("CUDA: 12.4")
elif any((is_windows(), is_linux())) and gpu_choice == "NVIDIA_CUDA128":
print("CUDA: 12.8")
From 35707c2dd89a9983f2038ded9bc67d13aa7bc213 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 21:39:57 -0700
Subject: [PATCH 64/79] Update README
---
README.md | 38 +++++++++++++++++++-------------------
1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/README.md b/README.md
index 5e7f37de..93d31131 100644
--- a/README.md
+++ b/README.md
@@ -32,13 +32,13 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## How to install
-#### Option 1: Portable builds (get started in 1 minute)
+#### ✅ Option 1: Portable builds (get started in 1 minute)
No installation needed – just download, unzip and run. All dependencies included.
Compatible with GGUF (llama.cpp) models on Windows, Linux, and macOS.
-Download from here: https://github.com/oobabooga/text-generation-webui/releases
+Download from here: **https://github.com/oobabooga/text-generation-webui/releases**
#### Option 2: One-click installer
@@ -57,23 +57,6 @@ You can pass command-line flags directly (e.g., `./start_linux.sh --help`), or a
To update, run the update script for your OS: `update_wizard_windows.bat`, `update_wizard_linux.sh`, or `update_wizard_macos.sh`.
-
-
-One-click installer details
-
-
-### One-click-installer
-
-The script uses Miniforge to set up a Conda environment in the `installer_files` folder.
-
-If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`.
-
-* There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root.
-* To install requirements for extensions, it is recommended to use the update wizard script with the "Install/update extensions requirements" option. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts.
-* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
-
-
-
Manual portable installation with venv
@@ -108,6 +91,23 @@ deactivate
```
+
+
+One-click installer details
+
+
+### One-click-installer
+
+The script uses Miniforge to set up a Conda environment in the `installer_files` folder.
+
+If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`.
+
+* There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root.
+* To install requirements for extensions, it is recommended to use the update wizard script with the "Install/update extensions requirements" option. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts.
+* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
+
+
+
Manual full installation with conda or docker
From 3dec47eaf8f94ba085e7d4d06522ce398d04bdbe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 21:43:46 -0700
Subject: [PATCH 65/79] Small one-click installer changes
---
one_click.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/one_click.py b/one_click.py
index 67be9e4b..1ea5e59f 100644
--- a/one_click.py
+++ b/one_click.py
@@ -16,7 +16,7 @@ import sys
# os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030'
# Define the required versions
-TORCH_VERSION = "2.7.0"
+TORCH_VERSION = "2.7.1"
PYTHON_VERSION = "3.11"
LIBSTDCXX_VERSION_LINUX = "12.1.0"
@@ -136,7 +136,7 @@ def get_pytorch_install_command(gpu_choice):
base_cmd = f"python -m pip install torch=={TORCH_VERSION} "
if gpu_choice == "NVIDIA_CUDA128":
- return "python -m pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128"
+ return base_cmd + "--index-url https://download.pytorch.org/whl/cu128"
elif gpu_choice == "AMD":
return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.2.4"
elif gpu_choice in ["APPLE", "NONE"]:
From 320f7339cdd3bf52ff705e0679d55fd738e61218 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 21:56:35 -0700
Subject: [PATCH 66/79] Update README
---
README.md | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index 93d31131..3832030c 100644
--- a/README.md
+++ b/README.md
@@ -139,19 +139,19 @@ conda activate textgen
| System | GPU | Command |
|--------|---------|---------|
-| Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
-| Linux/WSL | CPU only | `pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu` |
-| Linux | AMD | `pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` |
-| MacOS + MPS | Any | `pip3 install torch==2.6.0` |
-| Windows | NVIDIA | `pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
-| Windows | CPU only | `pip3 install torch==2.6.0` |
+| Linux/WSL | NVIDIA | `pip3 install torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128` |
+| Linux/WSL | CPU only | `pip3 install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu` |
+| Linux | AMD | `pip3 install torch==2.7.1 --index-url https://download.pytorch.org/whl/rocm6.2.4` |
+| MacOS + MPS | Any | `pip3 install torch==2.7.1` |
+| Windows | NVIDIA | `pip3 install torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128` |
+| Windows | CPU only | `pip3 install torch==2.7.1` |
The up-to-date commands can be found here: https://pytorch.org/get-started/locally/.
If you need `nvcc` to compile some library manually, you will additionally need to install this:
```
-conda install -y -c "nvidia/label/cuda-12.4.1" cuda
+conda install -y -c "nvidia/label/cuda-12.8.1" cuda
```
#### 3. Install the web UI
From 6bf31479d92ab9da0c643ca907a90b300a230f25 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 22:00:21 -0700
Subject: [PATCH 67/79] Update README
---
README.md | 60 +++++++++++++++++++++++++++----------------------------
1 file changed, 29 insertions(+), 31 deletions(-)
diff --git a/README.md b/README.md
index 3832030c..ccde678a 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,35 @@ Compatible with GGUF (llama.cpp) models on Windows, Linux, and macOS.
Download from here: **https://github.com/oobabooga/text-generation-webui/releases**
-#### Option 2: One-click installer
+#### Option 2: Manual portable install with venv
+
+Very fast setup that should work on any Python 3.9+:
+
+```bash
+# Clone repository
+git clone https://github.com/oobabooga/text-generation-webui
+cd text-generation-webui
+
+# Create virtual environment
+python -m venv venv
+
+# Activate virtual environment
+# On Windows:
+venv\Scripts\activate
+# On macOS/Linux:
+source venv/bin/activate
+
+# Install dependencies (choose appropriate file under requirements/portable for your hardware)
+pip install -r requirements/portable/requirements.txt --upgrade
+
+# Launch server (basic command)
+python server.py --portable --api --auto-launch
+
+# When done working, deactivate
+deactivate
+```
+
+#### Option 3: One-click installer
For users who need additional backends (ExLlamaV3, Transformers) or extensions (TTS, voice input, translation, etc). Requires ~10GB disk space and downloads PyTorch.
@@ -62,36 +90,6 @@ To update, run the update script for your OS: `update_wizard_windows.bat`, `upda
Manual portable installation with venv
-### Manual portable installation with venv
-
-Very fast setup that should work on any Python 3.9+:
-
-```bash
-# Clone repository
-git clone https://github.com/oobabooga/text-generation-webui
-cd text-generation-webui
-
-# Create virtual environment
-python -m venv venv
-
-# Activate virtual environment
-# On Windows:
-venv\Scripts\activate
-# On macOS/Linux:
-source venv/bin/activate
-
-# Install dependencies (choose appropriate file under requirements/portable for your hardware)
-pip install -r requirements/portable/requirements.txt
-
-# Launch server (basic command)
-python server.py --portable --api --auto-launch
-
-# When done working, deactivate
-deactivate
-```
-
-
-
One-click installer details
From 8cdb911a6e637c355dc9eac2ab43f94eab7b3281 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 22:06:12 -0700
Subject: [PATCH 68/79] Update README
---
README.md | 29 +++++++++++++----------------
1 file changed, 13 insertions(+), 16 deletions(-)
diff --git a/README.md b/README.md
index ccde678a..f213f7a9 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,6 @@
A Gradio web UI for Large Language Models.
-Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation.
-
[Try the Deep Reason extension](https://oobabooga.gumroad.com/l/deep_reason)
| |  |
@@ -79,16 +77,11 @@ For users who need additional backends (ExLlamaV3, Transformers) or extensions (
To restart the web UI later, run the same `start_` script.
-To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again.
-
You can pass command-line flags directly (e.g., `./start_linux.sh --help`), or add them to `user_data/CMD_FLAGS.txt` (e.g., `--api` to enable the API).
To update, run the update script for your OS: `update_wizard_windows.bat`, `update_wizard_linux.sh`, or `update_wizard_macos.sh`.
-
-
-Manual portable installation with venv
-
+To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again.
One-click installer details
@@ -236,13 +229,13 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [-
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
[--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant]
[--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock]
- [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
- [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
- [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR]
- [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
- [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
- [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY]
- [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
+ [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--mmproj MMPROJ] [--ctx-size N] [--cache-type N]
+ [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--enable-tp]
+ [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] [--deepspeed]
+ [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen]
+ [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE]
+ [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY]
+ [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
Text generation web UI
@@ -299,6 +292,7 @@ llama.cpp:
--row-split Split the model by rows across GPUs. This may improve multi-gpu performance.
--extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
--streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
+ --mmproj MMPROJ Path to the mmproj file for vision models.
Context and cache:
--ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens.
@@ -312,6 +306,10 @@ Speculative decoding:
--device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1
--ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model.
+ExLlamaV3:
+ --enable-tp, --enable_tp Enable Tensor Parallelism (TP) to split the model across GPUs.
+ --tp-backend TP_BACKEND The backend for tensor parallelism. Valid options: native, nccl. Default: native.
+
ExLlamaV2:
--gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.
--autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.
@@ -320,7 +318,6 @@ ExLlamaV2:
--no_xformers Force xformers to not be used.
--no_sdpa Force Torch SDPA to not be used.
--num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral.
- --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2.
TensorRT-LLM:
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
From 8a14aa62ff369615de895d155b0a105a3b4f7cb8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 22:06:59 -0700
Subject: [PATCH 69/79] Update README
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index f213f7a9..6e5cb5d8 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,7 @@ To update, run the update script for your OS: `update_wizard_windows.bat`, `upda
To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again.
+
One-click installer details
From 6b1b2e2373df2a17ac48eaf5c53494aa8f4b8a57 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 22:19:20 -0700
Subject: [PATCH 70/79] Update README
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 6e5cb5d8..6b49cee0 100644
--- a/README.md
+++ b/README.md
@@ -378,7 +378,7 @@ text-generation-webui
└── llama-2-13b-chat.Q4_K_M.gguf
```
-* The remaining model types (like 16-bit Transformers models and EXL2 models) are made of several files and must be placed in a subfolder. Example:
+* The remaining model types (like 16-bit Transformers models and EXL3 models) are made of several files and must be placed in a subfolder. Example:
```
text-generation-webui
From 15f99b1b710aced1ce8db70748a2a82602457661 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 18 Aug 2025 05:51:04 -0700
Subject: [PATCH 71/79] Installer: Fix a requirement file
---
one_click.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/one_click.py b/one_click.py
index 1ea5e59f..2c7c2c0c 100644
--- a/one_click.py
+++ b/one_click.py
@@ -171,14 +171,14 @@ def get_requirements_file(gpu_choice):
"""Get requirements file path based on GPU choice"""
requirements_base = os.path.join("requirements", "full")
- if gpu_choice == "AMD":
+ if gpu_choice == "NVIDIA_CUDA128":
+ file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt"
+ elif gpu_choice == "AMD":
file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt"
elif gpu_choice == "APPLE":
file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt"
elif gpu_choice in ["INTEL", "NONE"]:
file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt"
- elif gpu_choice == "NVIDIA_CUDA128":
- file_name = f"requirements_cuda128{'_noavx2' if not cpu_has_avx2() else ''}.txt"
else:
raise ValueError(f"Unknown GPU choice: {gpu_choice}")
From 08594e52636d6a6d583a87ecb6fd10e49821c500 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 18 Aug 2025 05:59:46 -0700
Subject: [PATCH 72/79] Installer: Slight improvement
---
one_click.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/one_click.py b/one_click.py
index 2c7c2c0c..58c0e8ff 100644
--- a/one_click.py
+++ b/one_click.py
@@ -408,7 +408,7 @@ def update_requirements(initial_installation=False, pull=True):
with open(requirements_file, 'r') as f:
after_pull_whl_lines = [line for line in f if '.whl' in line]
- wheels_changed = wheels_changed or (before_pull_whl_lines != after_pull_whl_lines)
+ wheels_changed = wheels_changed or (before_pull_whl_lines != after_pull_whl_lines)
# Check for changes to installer files
for file in files_to_check:
From 7d23a55901a43c323d2afe6d8c4585e7c9c3bca2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 18 Aug 2025 09:05:47 -0700
Subject: [PATCH 73/79] Fix model unloading when switching loaders (closes
#7203)
---
modules/models.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/modules/models.py b/modules/models.py
index 938eed3d..e620957b 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -128,10 +128,12 @@ def unload_model(keep_model_name=False):
if shared.model is None:
return
- is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
- if shared.args.loader in ['ExLlamav3_HF', 'ExLlamav3']:
+ model_class_name = shared.model.__class__.__name__
+ is_llamacpp = (model_class_name == 'LlamaServer')
+
+ if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
shared.model.unload()
- elif shared.args.loader in ['ExLlamav2_HF', 'ExLlamav2'] and hasattr(shared.model, 'unload'):
+ elif model_class_name in ['Exllamav2Model', 'Exllamav2HF'] and hasattr(shared.model, 'unload'):
shared.model.unload()
shared.model = shared.tokenizer = None
From 8805a50d24066dd5645b2dbb85595bc07d75c34c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 18 Aug 2025 15:31:01 -0700
Subject: [PATCH 74/79] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 6 +++---
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 6 +++---
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
16 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index d57a457c..9f906b26 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 47bcb60a..70e031b8 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 6958ce37..81556326 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 13361a78..7b9d3650 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index e4a30168..0fc9162f 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 3a9a953b..3565a994 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index a3e176d3..64c17416 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index b073a3a9..2b162308 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 0c7f1d29..943ea600 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 09f1c502..394b89b6 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 75296cb4..cffe3aea 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index ff3d7cb1..d274e2c8 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 97414bde..47ec086e 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 7f543205..9a0a3694 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index c1764ead..45e96da9 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 142b67ec..9183562e 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From cbba58bef9f70a366413cce145a907658a24a982 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 18 Aug 2025 15:50:09 -0700
Subject: [PATCH 75/79] UI: Fix code blocks having an extra empty line
---
modules/html_generator.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/modules/html_generator.py b/modules/html_generator.py
index cb14a722..279f9ba6 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -306,6 +306,9 @@ def process_markdown_content(string):
# Convert to HTML using markdown
html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])
+    # Remove extra newlines before </code>
+    html_output = re.sub(r'\s*</code>', '</code>', html_output)
+
     # Unescape code blocks
     pattern = re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL)
     html_output = pattern.sub(lambda x: html.unescape(x.group()), html_output)
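For reference, a minimal standalone sketch of the cleanup this hunk performs (the sample HTML string below is illustrative, not taken from the project):

```python
import re

# markdown's fenced_code extension tends to leave a trailing newline inside
# <pre><code> blocks, which renders as an extra empty line in the UI.
html_output = '<pre><code class="language-python">print("hi")\n\n</code></pre>'

# Same substitution as the patch: drop any whitespace run before </code>.
cleaned = re.sub(r'\s*</code>', '</code>', html_output)

print(cleaned)  # -> <pre><code class="language-python">print("hi")</code></pre>
```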
From 5b06284a8af7d5bf068210124797fa7e4b31ade4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Aug 2025 06:23:21 -0700
Subject: [PATCH 76/79] UI: Keep ExLlamav3_HF selected if already selected for
EXL3 models
---
modules/models_settings.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models_settings.py b/modules/models_settings.py
index bf7b1cf9..c325fa0c 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -251,7 +251,7 @@ def apply_model_settings_to_state(model, state):
model_settings = get_model_metadata(model)
if 'loader' in model_settings:
loader = model_settings.pop('loader')
- if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
+ if not ((loader == 'ExLlamav2_HF' and state['loader'] == 'ExLlamav2') or (loader == 'ExLlamav3_HF' and state['loader'] == 'ExLlamav3')):
state['loader'] = loader
for k in model_settings:
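A minimal sketch of the guard added above, with the loader pairs spelled out (the function and variable names here are illustrative, not part of the codebase):

```python
def resolve_loader(suggested, current):
    """Keep the currently selected loader when it is paired with the suggested one."""
    paired = (
        (suggested == 'ExLlamav2_HF' and current == 'ExLlamav2')
        or (suggested == 'ExLlamav3_HF' and current == 'ExLlamav3')
    )
    return current if paired else suggested


assert resolve_loader('ExLlamav3_HF', 'ExLlamav3') == 'ExLlamav3'      # selection kept
assert resolve_loader('ExLlamav3_HF', 'llama.cpp') == 'ExLlamav3_HF'   # selection replaced
```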
From e0f5905a97bd40a343003b4626e08b3fec9416de Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Aug 2025 06:34:05 -0700
Subject: [PATCH 77/79] Code formatting
---
modules/exllamav3.py | 3 ++-
one_click.py | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 73962977..fd676a00 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -236,11 +236,12 @@ class Exllamav3Model:
"""
Generate text with streaming using native ExLlamaV3 API
"""
- image_embeddings = []
if shared.is_multimodal:
# Process images and modify prompt (ExLlamaV3-specific)
prompt, image_embeddings = self._process_images_for_generation(prompt, state)
+ else:
+ image_embeddings = []
# Greedy decoding is a special case
if state['temperature'] == 0:
diff --git a/one_click.py b/one_click.py
index 58c0e8ff..881d7489 100644
--- a/one_click.py
+++ b/one_click.py
@@ -155,7 +155,7 @@ def get_pytorch_update_command(gpu_choice):
base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} "
if gpu_choice == "NVIDIA_CUDA128":
- return "python -m pip install --upgrade torch==2.7.1 --index-url https://download.pytorch.org/whl/cu128"
+ return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu128"
elif gpu_choice == "AMD":
return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
elif gpu_choice in ["APPLE", "NONE"]:
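For the one_click.py hunk, a minimal sketch of the resulting command construction; TORCH_VERSION is a placeholder value here (the real constant is defined elsewhere in one_click.py), and the APPLE/NONE branches are omitted:

```python
TORCH_VERSION = "2.7.1"  # assumed for illustration


def get_pytorch_update_command(gpu_choice):
    base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} "
    if gpu_choice == "NVIDIA_CUDA128":
        # After the patch, this branch reuses base_cmd instead of hard-coding
        # the Torch version a second time.
        return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu128"
    elif gpu_choice == "AMD":
        return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
    return base_cmd


print(get_pytorch_update_command("NVIDIA_CUDA128"))
```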
From 1972479610f4b1482912ff012469e8ab9cbaa908 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Aug 2025 06:48:22 -0700
Subject: [PATCH 78/79] Add the TP option to exllamav3_HF
---
modules/exllamav3_hf.py | 5 +++++
modules/loaders.py | 2 ++
2 files changed, 7 insertions(+)
diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index 1254ff5d..d9f4ed57 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -74,6 +74,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
load_params['use_per_device'] = split
+ # Tensor-parallelism
+ if shared.args.enable_tp:
+ load_params['tensor_p'] = True
+ load_params['tp_backend'] = shared.args.tp_backend
+
self.ex_model.load(**load_params)
self.past_seq = None
self.max_tokens = max_tokens
diff --git a/modules/loaders.py b/modules/loaders.py
index 295db1e7..f88e976d 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -56,6 +56,8 @@ loaders_and_params = OrderedDict({
'cfg_cache',
'trust_remote_code',
'no_use_fast',
+ 'enable_tp',
+ 'tp_backend',
],
'ExLlamav3': [
'ctx_size',
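A minimal sketch of how the new options feed into the ExLlamaV3 load parameters; `args` stands in for `shared.args`, and the example values (including the backend name) are assumptions for illustration only:

```python
from types import SimpleNamespace

args = SimpleNamespace(gpu_split="20,24", enable_tp=True, tp_backend="nccl")  # values assumed

load_params = {}

if args.gpu_split:
    # Manual per-device split, e.g. "20,24" -> [20.0, 24.0]
    load_params['use_per_device'] = [float(alloc) for alloc in args.gpu_split.split(",")]

# Tensor-parallelism (the keys mirror the exllamav3_hf.py hunk above)
if args.enable_tp:
    load_params['tensor_p'] = True
    load_params['tp_backend'] = args.tp_backend

print(load_params)
# {'use_per_device': [20.0, 24.0], 'tensor_p': True, 'tp_backend': 'nccl'}
```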
From 9e7b326e3402de37adadc8509764738e98113763 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 19 Aug 2025 06:50:40 -0700
Subject: [PATCH 79/79] Lint
---
modules/models.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/models.py b/modules/models.py
index e620957b..ca3d184f 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -55,7 +55,7 @@ def load_model(model_name, loader=None):
if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
shared.settings['truncation_length'] = shared.args.ctx_size
- shared.is_multimodal = False
+ shared.is_multimodal = False
if loader.lower() in ('exllamav3', 'llama.cpp'):
shared.is_multimodal = model.is_multimodal()
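Finally, a minimal sketch of the flag handling that the lint fix touches; `shared` and `model` are stand-ins for the real module and model object:

```python
def update_multimodal_flag(shared, model, loader):
    # Reset on every load, then query only the loaders that can report
    # multimodal support.
    shared.is_multimodal = False
    if loader.lower() in ('exllamav3', 'llama.cpp'):
        shared.is_multimodal = model.is_multimodal()
```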