From b391ac8eb1ba63e449f0ef021db56d6513dce646 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 17:51:24 -0700 Subject: [PATCH 01/51] Fix getting the ctx-size for EXL3/EXL2/Transformers models --- modules/models_settings.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index e35e1c04..4e53dc81 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -106,9 +106,16 @@ def get_model_metadata(model): for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']: if k in metadata: - model_settings['truncation_length'] = metadata[k] - model_settings['truncation_length_info'] = metadata[k] - model_settings['ctx_size'] = min(metadata[k], 8192) + value = metadata[k] + elif k in metadata.get('text_config', {}): + value = metadata['text_config'][k] + else: + continue + + model_settings['truncation_length'] = value + model_settings['truncation_length_info'] = value + model_settings['ctx_size'] = min(value, 8192) + break if 'rope_theta' in metadata: model_settings['rope_freq_base'] = metadata['rope_theta'] From 88127f46c124723554b5e87cad9c868348ed4c53 Mon Sep 17 00:00:00 2001 From: Katehuuh <133996730+Katehuuh@users.noreply.github.com> Date: Sat, 9 Aug 2025 04:31:16 +0200 Subject: [PATCH 02/51] Add multimodal support (ExLlamaV3) (#7174) --- css/main.css | 13 ++ docs/12 - OpenAI API.md | 18 ++ extensions/openai/completions.py | 102 +++++++++- extensions/openai/image_utils.py | 97 ++++++++++ extensions/openai/typing.py | 16 +- modules/chat.py | 126 ++++++++++--- modules/exllamav3.py | 313 +++++++++++++++++++++++++++++++ modules/html_generator.py | 29 ++- modules/loaders.py | 40 ++++ modules/models.py | 13 +- modules/shared.py | 2 + modules/text_generation.py | 10 +- modules/ui_chat.py | 2 +- 13 files changed, 726 insertions(+), 55 deletions(-) create mode 100644 extensions/openai/image_utils.py create mode 100644 modules/exllamav3.py diff --git a/css/main.css b/css/main.css index 240a94d5..de16d81d 100644 --- a/css/main.css +++ b/css/main.css @@ -1577,6 +1577,19 @@ strong { margin-top: 4px; } +.image-attachment { + flex-direction: column; +} + +.image-preview { + border-radius: 16px; + margin-bottom: 5px; + object-fit: cover; + object-position: center; + border: 2px solid var(--border-color-primary); + aspect-ratio: 1 / 1; +} + button:focus { outline: none; } diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index ec999397..b7b5fbc1 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -77,6 +77,24 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` +#### Multimodal support (ExLlamaV3) + +```shell +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What color is this image?"}, + {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}} + ] + } + ] + }' +``` + #### SSE streaming ```shell diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 5181b18b..3d389f0b 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -7,6 +7,7 @@ import tiktoken from pydantic import ValidationError from extensions.openai.errors import InvalidRequestError +from extensions.openai.image_utils import convert_openai_messages_to_images from 
extensions.openai.typing import ToolDefinition from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from modules import shared @@ -16,6 +17,7 @@ from modules.chat import ( load_character_memoized, load_instruction_template_memoized ) +from modules.logging_colors import logger from modules.presets import load_preset_memoized from modules.text_generation import decode, encode, generate_reply @@ -82,6 +84,21 @@ def process_parameters(body, is_legacy=False): return generate_params +def process_multimodal_content(content): + """Extract text from OpenAI multimodal format for non-multimodal models""" + if isinstance(content, str): + return content + + if isinstance(content, list): + text_parts = [] + for item in content: + if isinstance(item, dict) and item.get('type') == 'text': + text_parts.append(item.get('text', '')) + return ' '.join(text_parts) if text_parts else str(content) + + return str(content) + + def convert_history(history): ''' Chat histories in this program are in the format [message, reply]. @@ -99,8 +116,11 @@ def convert_history(history): role = entry["role"] if role == "user": + # Extract text content (images handled by model-specific code) + content = process_multimodal_content(content) user_input = content user_input_last = True + if current_message: chat_dialogue.append([current_message, '', '']) current_message = "" @@ -126,7 +146,11 @@ def convert_history(history): if not user_input_last: user_input = "" - return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue)} + return user_input, system_message, { + 'internal': chat_dialogue, + 'visible': copy.deepcopy(chat_dialogue), + 'messages': history # Store original messages for multimodal models + } def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict: @@ -150,9 +174,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p elif m['role'] == 'function': raise InvalidRequestError(message="role: function is not supported.", param='messages') - if 'content' not in m and "image_url" not in m: + # Handle multimodal content validation + content = m.get('content') + if content is None: raise InvalidRequestError(message="messages: missing content", param='messages') + # Validate multimodal content structure + if isinstance(content, list): + for item in content: + if not isinstance(item, dict) or 'type' not in item: + raise InvalidRequestError(message="messages: invalid content item format", param='messages') + if item['type'] not in ['text', 'image_url']: + raise InvalidRequestError(message="messages: unsupported content type", param='messages') + if item['type'] == 'text' and 'text' not in item: + raise InvalidRequestError(message="messages: missing text in content item", param='messages') + if item['type'] == 'image_url' and ('image_url' not in item or 'url' not in item['image_url']): + raise InvalidRequestError(message="messages: missing image_url in content item", param='messages') + # Chat Completions object_type = 'chat.completion' if not stream else 'chat.completion.chunk' created_time = int(time.time()) @@ -336,9 +374,26 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): prompt_str = 'context' if is_legacy else 'prompt' - # ... encoded as a string, array of strings, array of tokens, or array of token arrays. 
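+ # Only the text parts of 'messages' are recovered into the prompt string here; any image parts stay in body['messages'] and are extracted into generate_params further down for the multimodal loaders.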
- if prompt_str not in body: - raise InvalidRequestError("Missing required input", param=prompt_str) + # Handle both prompt and messages format for unified multimodal support + if prompt_str not in body or body[prompt_str] is None: + if 'messages' in body: + # Convert messages format to prompt for completions endpoint + prompt_text = "" + for message in body.get('messages', []): + if isinstance(message, dict) and 'content' in message: + # Extract text content from multimodal messages + content = message['content'] + if isinstance(content, str): + prompt_text += content + elif isinstance(content, list): + for item in content: + if isinstance(item, dict) and item.get('type') == 'text': + prompt_text += item.get('text', '') + + # Allow empty prompts for image-only requests + body[prompt_str] = prompt_text + else: + raise InvalidRequestError("Missing required input", param=prompt_str) # common params generate_params = process_parameters(body, is_legacy=is_legacy) @@ -349,9 +404,18 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): suffix = body['suffix'] if body['suffix'] else '' echo = body['echo'] + # Add messages to generate_params if present for multimodal processing + if 'messages' in body: + generate_params['messages'] = body['messages'] + if not stream: prompt_arg = body[prompt_str] - if isinstance(prompt_arg, str) or (isinstance(prompt_arg, list) and isinstance(prompt_arg[0], int)): + + # Handle empty/None prompts (e.g., image-only requests) + if prompt_arg is None: + prompt_arg = "" + + if isinstance(prompt_arg, str) or (isinstance(prompt_arg, list) and len(prompt_arg) > 0 and isinstance(prompt_arg[0], int)): prompt_arg = [prompt_arg] resp_list_data = [] @@ -374,7 +438,19 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # generate reply ####################################### debug_msg({'prompt': prompt, 'generate_params': generate_params}) - generator = generate_reply(prompt, generate_params, is_chat=False) + + # Use multimodal generation if images are present + if 'messages' in generate_params: + raw_images = convert_openai_messages_to_images(generate_params['messages']) + if raw_images: + logger.info(f"Using multimodal generation for {len(raw_images)} images") + generate_params['raw_images'] = raw_images + generator = shared.model.generate_with_streaming(prompt, generate_params) + else: + generator = generate_reply(prompt, generate_params, is_chat=False) + else: + generator = generate_reply(prompt, generate_params, is_chat=False) + answer = '' for a in generator: @@ -447,7 +523,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # generate reply ####################################### debug_msg({'prompt': prompt, 'generate_params': generate_params}) - generator = generate_reply(prompt, generate_params, is_chat=False) + # Use multimodal generation if images are present + if 'messages' in generate_params: + raw_images = convert_openai_messages_to_images(generate_params['messages']) + if raw_images: + logger.info(f"Using multimodal generation for {len(raw_images)} images") + generate_params['raw_images'] = raw_images + generator = shared.model.generate_with_streaming(prompt, generate_params) + else: + generator = generate_reply(prompt, generate_params, is_chat=False) + else: + generator = generate_reply(prompt, generate_params, is_chat=False) answer = '' seen_content = '' diff --git a/extensions/openai/image_utils.py b/extensions/openai/image_utils.py new file mode 100644 index 00000000..c54f0532 --- 
/dev/null +++ b/extensions/openai/image_utils.py @@ -0,0 +1,97 @@ +""" +Shared image processing utilities for multimodal support. +Used by both ExLlamaV3 and llama.cpp implementations. +""" +import base64 +import io +from typing import Any, List, Tuple + +from PIL import Image + +from modules.logging_colors import logger + + +def decode_base64_image(base64_string: str) -> Image.Image: + """Decodes a base64 string to a PIL Image.""" + try: + if base64_string.startswith('data:image/'): + base64_string = base64_string.split(',', 1)[1] + + image_data = base64.b64decode(base64_string) + image = Image.open(io.BytesIO(image_data)) + return image + except Exception as e: + logger.error(f"Failed to decode base64 image: {e}") + raise ValueError(f"Invalid base64 image data: {e}") + + +def process_message_content(content: Any) -> Tuple[str, List[Image.Image]]: + """ + Processes message content that may contain text and images. + Returns: A tuple of (text_content, list_of_pil_images). + """ + if isinstance(content, str): + return content, [] + + if isinstance(content, list): + text_parts = [] + images = [] + for item in content: + if not isinstance(item, dict): + continue + + item_type = item.get('type', '') + if item_type == 'text': + text_parts.append(item.get('text', '')) + elif item_type == 'image_url': + image_url_data = item.get('image_url', {}) + image_url = image_url_data.get('url', '') + + if image_url.startswith('data:image/'): + try: + images.append(decode_base64_image(image_url)) + except Exception as e: + logger.warning(f"Failed to process a base64 image: {e}") + elif image_url.startswith('http'): + # Support external URLs + try: + import requests + response = requests.get(image_url, timeout=10) + response.raise_for_status() + image_data = response.content + image = Image.open(io.BytesIO(image_data)) + images.append(image) + logger.info("Successfully loaded external image from URL") + except Exception as e: + logger.warning(f"Failed to fetch external image: {e}") + else: + logger.warning(f"Unsupported image URL format: {image_url[:70]}...") + + return ' '.join(text_parts), images + + return str(content), [] + + +def convert_image_attachments_to_pil(image_attachments: List[dict]) -> List[Image.Image]: + """Convert webui image_attachments format to PIL Images.""" + pil_images = [] + for attachment in image_attachments: + if attachment.get('type') == 'image' and 'image_data' in attachment: + try: + image = decode_base64_image(attachment['image_data']) + if image.mode != 'RGB': + image = image.convert('RGB') + pil_images.append(image) + except Exception as e: + logger.warning(f"Failed to process image attachment: {e}") + return pil_images + + +def convert_openai_messages_to_images(messages: List[dict]) -> List[Image.Image]: + """Convert OpenAI messages format to PIL Images.""" + all_images = [] + for message in messages: + if isinstance(message, dict) and 'content' in message: + _, images = process_message_content(message['content']) + all_images.extend(images) + return all_images diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 6bd3749f..e9f92da5 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -2,7 +2,7 @@ import json import time from typing import Dict, List, Optional -from pydantic import BaseModel, Field, validator +from pydantic import BaseModel, Field, field_validator, validator class GenerationOptions(BaseModel): @@ -99,7 +99,8 @@ class ToolCall(BaseModel): class CompletionRequestParams(BaseModel): model: str | None = 
Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.") - prompt: str | List[str] + prompt: str | List[str] | None = Field(default=None, description="Text prompt for completion. Can also use 'messages' format for multimodal.") + messages: List[dict] | None = Field(default=None, description="OpenAI messages format for multimodal support. Alternative to 'prompt'.") best_of: int | None = Field(default=1, description="Unused parameter.") echo: bool | None = False frequency_penalty: float | None = 0 @@ -115,6 +116,17 @@ class CompletionRequestParams(BaseModel): top_p: float | None = 1 user: str | None = Field(default=None, description="Unused parameter.") + @field_validator('prompt', 'messages') + @classmethod + def validate_prompt_or_messages(cls, v, info): + """Ensure either 'prompt' or 'messages' is provided for completions.""" + if info.field_name == 'prompt': # If we're validating 'prompt', check if neither prompt nor messages will be set + messages = info.data.get('messages') + if v is None and messages is None: + raise ValueError("Either 'prompt' or 'messages' must be provided") + + return v + class CompletionRequest(GenerationOptions, CompletionRequestParams): pass diff --git a/modules/chat.py b/modules/chat.py index 1ab91b5e..354ae46b 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -271,16 +271,27 @@ def generate_chat_prompt(user_input, state, **kwargs): # Add attachment content if present AND if past attachments are enabled if (state.get('include_past_attachments', True) and user_key in metadata and "attachments" in metadata[user_key]): attachments_text = "" - for attachment in metadata[user_key]["attachments"]: - filename = attachment.get("name", "file") - content = attachment.get("content", "") - if attachment.get("type") == "text/html" and attachment.get("url"): - attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" - else: - attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + image_refs = "" - if attachments_text: - enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}" + for attachment in metadata[user_key]["attachments"]: + if attachment.get("type") == "image": + # Add image reference for multimodal models + image_refs += "<__media__>" + else: + # Handle text/PDF attachments + filename = attachment.get("name", "file") + content = attachment.get("content", "") + if attachment.get("type") == "text/html" and attachment.get("url"): + attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" + else: + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if image_refs or attachments_text: + enhanced_user_msg = user_msg + if image_refs: + enhanced_user_msg += f" {image_refs}" + if attachments_text: + enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}" messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) @@ -301,16 +312,23 @@ def generate_chat_prompt(user_input, state, **kwargs): if user_key in metadata and "attachments" in metadata[user_key]: attachments_text = "" - for attachment in metadata[user_key]["attachments"]: - filename = attachment.get("name", "file") - content = attachment.get("content", "") - if attachment.get("type") == "text/html" and attachment.get("url"): - attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" - else: - 
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + image_refs = "" - if attachments_text: - user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}" + for attachment in metadata[user_key]["attachments"]: + if attachment.get("type") == "image": + image_refs += "<__media__>" + else: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + if attachment.get("type") == "text/html" and attachment.get("url"): + attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" + else: + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if image_refs or attachments_text: + user_input = f"{user_input} {image_refs}" + if attachments_text: + user_input += f"\n\nATTACHMENTS:\n{attachments_text}" messages.append({"role": "user", "content": user_input}) @@ -594,29 +612,64 @@ def add_message_attachment(history, row_idx, file_path, is_user=True): file_extension = path.suffix.lower() try: - # Handle different file types - if file_extension == '.pdf': + # Handle image files + if file_extension in ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif']: + # Convert image to base64 + with open(path, 'rb') as f: + image_data = base64.b64encode(f.read()).decode('utf-8') + + # Determine MIME type from extension + mime_type_map = { + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.png': 'image/png', + '.webp': 'image/webp', + '.bmp': 'image/bmp', + '.gif': 'image/gif' + } + mime_type = mime_type_map.get(file_extension, 'image/jpeg') + + # Format as data URL + data_url = f"data:{mime_type};base64,{image_data}" + + # Generate unique image ID + image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1 + + attachment = { + "name": filename, + "type": "image", + "image_data": data_url, + "image_id": image_id, + "file_path": str(path) # For UI preview + } + elif file_extension == '.pdf': # Process PDF file content = extract_pdf_text(path) - file_type = "application/pdf" + attachment = { + "name": filename, + "type": "application/pdf", + "content": content, + } elif file_extension == '.docx': content = extract_docx_text(path) - file_type = "application/docx" + attachment = { + "name": filename, + "type": "application/docx", + "content": content, + } else: # Default handling for text files with open(path, 'r', encoding='utf-8') as f: content = f.read() - file_type = "text/plain" - # Add attachment - attachment = { - "name": filename, - "type": file_type, - "content": content, - } + attachment = { + "name": filename, + "type": "text/plain", + "content": content, + } history['metadata'][key]["attachments"].append(attachment) - return content # Return the content for reuse + return attachment # Return the attachment for reuse except Exception as e: logger.error(f"Error processing attachment {filename}: {e}") return None @@ -759,6 +812,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for file_path in files: add_message_attachment(output, row_idx, file_path, is_user=True) + # Collect image attachments for ExLlamaV3 + image_attachments = [] + if 'metadata' in output: + user_key = f"user_{row_idx}" + if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: + for attachment in output['metadata'][user_key]["attachments"]: + if attachment.get("type") == "image": + image_attachments.append(attachment) + + # Add image attachments to state for the generation + if 
image_attachments: + state['image_attachments'] = image_attachments + # Add web search results as attachments if enabled if state.get('enable_web_search', False): search_query = generate_search_query(text, state) diff --git a/modules/exllamav3.py b/modules/exllamav3.py new file mode 100644 index 00000000..c2532ec3 --- /dev/null +++ b/modules/exllamav3.py @@ -0,0 +1,313 @@ +import traceback +from pathlib import Path +from typing import Any, List, Tuple + +from exllamav3 import Cache, Config, Generator, Model, Tokenizer +from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant + +from extensions.openai.image_utils import ( + convert_image_attachments_to_pil, + convert_openai_messages_to_images +) +from modules import shared +from modules.logging_colors import logger + +try: + import flash_attn +except Exception: + logger.warning('Failed to load flash-attention due to the following error:\n') + traceback.print_exc() + + +class Exllamav3Model: + def __init__(self): + pass + + @classmethod + def from_pretrained(cls, path_to_model): + path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model) + + # Reset global MMTokenAllocator to prevent token ID corruption when switching models + from exllamav3.tokenizer.mm_embedding import ( + FIRST_MM_EMBEDDING_INDEX, + global_allocator + ) + global_allocator.next_token_index = FIRST_MM_EMBEDDING_INDEX + logger.info("Reset MMTokenAllocator for clean multimodal token allocation") + + config = Config.from_directory(str(path_to_model)) + model = Model.from_config(config) + + # Calculate the closest multiple of 256 at or above the chosen value + max_tokens = shared.args.ctx_size + if max_tokens % 256 != 0: + adjusted_tokens = ((max_tokens // 256) + 1) * 256 + logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}") + max_tokens = adjusted_tokens + + # Parse cache type (ExLlamaV2 pattern) + cache_type = shared.args.cache_type.lower() + cache_kwargs = {} + if cache_type == 'fp16': + layer_type = CacheLayer_fp16 + elif cache_type.startswith('q'): + layer_type = CacheLayer_quant + if '_' in cache_type: + # Different bits for k and v (e.g., q4_q8) + k_part, v_part = cache_type.split('_') + k_bits = int(k_part[1:]) + v_bits = int(v_part[1:]) + else: + # Same bits for k and v (e.g., q4) + k_bits = v_bits = int(cache_type[1:]) + + # Validate bit ranges + if not (2 <= k_bits <= 8 and 2 <= v_bits <= 8): + logger.warning(f"Invalid quantization bits: k_bits={k_bits}, v_bits={v_bits}. Must be between 2 and 8. Falling back to fp16.") + layer_type = CacheLayer_fp16 + else: + cache_kwargs = {'k_bits': k_bits, 'v_bits': v_bits} + else: + logger.warning(f"Unrecognized cache type: {cache_type}. 
Falling back to fp16.") + layer_type = CacheLayer_fp16 + + cache = Cache(model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs) + + load_params = {'progressbar': True} + if shared.args.gpu_split: + split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] + load_params['use_per_device'] = split + + model.load(**load_params) + + tokenizer = Tokenizer.from_config(config) + + # Load vision model component (ExLlamaV3 native) + vision_model = None + try: + logger.info("Loading vision model component...") + vision_model = Model.from_config(config, component="vision") + vision_model.load(progressbar=True) + logger.info("Vision model loaded successfully") + except Exception as e: + logger.warning(f"Vision model loading failed (multimodal disabled): {e}") + + generator = Generator( + model=model, + cache=cache, + tokenizer=tokenizer, + ) + + result = cls() + result.model = model + result.cache = cache + result.tokenizer = tokenizer + result.generator = generator + result.config = config + result.max_tokens = max_tokens + result.vision_model = vision_model + + return result + + def is_multimodal(self) -> bool: + """Check if this model supports multimodal input.""" + return hasattr(self, 'vision_model') and self.vision_model is not None + + def _process_images_for_generation(self, prompt: str, state: dict) -> Tuple[str, List[Any]]: + """ + Process all possible image inputs and return modified prompt + embeddings. + Returns: (processed_prompt, image_embeddings) + """ + if not self.is_multimodal(): + return prompt, [] + + # Collect images from various sources using shared utilities + pil_images = [] + + # From webui image_attachments (preferred format) + if 'image_attachments' in state and state['image_attachments']: + pil_images.extend(convert_image_attachments_to_pil(state['image_attachments'])) + + # From OpenAI API raw_images + elif 'raw_images' in state and state['raw_images']: + pil_images.extend(state['raw_images']) + + # From OpenAI API messages format + elif 'messages' in state and state['messages']: + pil_images.extend(convert_openai_messages_to_images(state['messages'])) + + if not pil_images: + return prompt, [] + + # ExLlamaV3-specific: Generate embeddings + try: + # Use pre-computed embeddings if available (proper MMEmbedding lifetime) + if 'image_embeddings' in state and state['image_embeddings']: + # Use existing embeddings - this preserves MMEmbedding lifetime + image_embeddings = state['image_embeddings'] + else: + # Do not reset the cache/allocator index; it causes token ID conflicts during generation. 
+ + logger.info(f"Processing {len(pil_images)} image(s) with ExLlamaV3 vision model") + image_embeddings = [ + self.vision_model.get_image_embeddings(tokenizer=self.tokenizer, image=img) + for img in pil_images + ] + + # ExLlamaV3-specific: Handle prompt processing with placeholders + placeholders = [ie.text_alias for ie in image_embeddings] + + if '<__media__>' in prompt: + # Web chat: Replace <__media__> placeholders + for alias in placeholders: + prompt = prompt.replace('<__media__>', alias, 1) + logger.info(f"Replaced {len(placeholders)} <__media__> placeholder(s)") + else: + # API: Prepend embedding aliases + combined_placeholders = "\n".join(placeholders) + prompt = combined_placeholders + "\n" + prompt + logger.info(f"Prepended {len(placeholders)} embedding(s) to prompt") + + return prompt, image_embeddings + + except Exception as e: + logger.error(f"Failed to process images: {e}") + return prompt, [] + + def generate_with_streaming(self, prompt, state): + """ + Generate text with streaming using native ExLlamaV3 API + """ + from exllamav3 import Job + from exllamav3.generator.sampler.presets import ComboSampler + + # Process images and modify prompt (ExLlamaV3-specific) + prompt, image_embeddings = self._process_images_for_generation(prompt, state) + + sampler = ComboSampler( + rep_p=state.get('repetition_penalty', 1.0), + freq_p=state.get('frequency_penalty', 0.0), + pres_p=state.get('presence_penalty', 0.0), + temperature=state.get('temperature', 0.7), + min_p=state.get('min_p', 0.0), + top_k=state.get('top_k', 0), + top_p=state.get('top_p', 1.0), + ) + + # Encode prompt with embeddings (ExLlamaV3-specific) + if image_embeddings: + input_ids = self.tokenizer.encode( + prompt, + encode_special_tokens=True, + embeddings=image_embeddings, + ) + else: + input_ids = self.tokenizer.encode(prompt, encode_special_tokens=True) + + # Get stop conditions from state (webui format) - keep as strings like ExLlamaV3 examples + stop_conditions = [] + if 'stopping_strings' in state and state['stopping_strings']: + # Use strings directly (ExLlamaV3 handles the conversion internally) + stop_conditions.extend(state['stopping_strings']) + + # Add EOS token ID as ExLlamaV3 examples do + if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + stop_conditions.append(self.tokenizer.eos_token_id) + + job = Job( + input_ids=input_ids, + max_new_tokens=state.get('max_new_tokens', 500), + decode_special_tokens=True, + embeddings=image_embeddings if image_embeddings else None, + sampler=sampler, + stop_conditions=stop_conditions if stop_conditions else None, + ) + + # Stream generation + self.generator.enqueue(job) + + response_text = "" + try: + while self.generator.num_remaining_jobs(): + results = self.generator.iterate() + for result in results: + if "eos" in result and result["eos"]: + break + + chunk = result.get("text", "") + if chunk: + response_text += chunk + yield response_text + finally: + # No cleanup needed. MMEmbedding lifetime is managed by Python. + # Cache and page table resets are unnecessary and can cause token ID conflicts. 
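+ # (The Job keeps references to any image embeddings for its whole lifetime, so they must stay alive until streaming ends.)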
+ pass + + def generate(self, prompt, state): + """ + Generate text using native ExLlamaV3 API (non-streaming) + """ + output = self.generator.generate( + prompt=prompt, + max_new_tokens=state.get('max_new_tokens', 500), + temperature=state.get('temperature', 0.7), + top_p=state.get('top_p', 1.0), + top_k=state.get('top_k', 0), + repetition_penalty=state.get('repetition_penalty', 1.0), + frequency_penalty=state.get('frequency_penalty', 0.0), + presence_penalty=state.get('presence_penalty', 0.0), + min_p=state.get('min_p', 0.0), + ) + + return output + + def encode(self, string, **kwargs): + return self.tokenizer.encode(string, **kwargs) + + def decode(self, ids, **kwargs): + return self.tokenizer.decode(ids, **kwargs) + + @property + def last_prompt_token_count(self): + # This would need to be tracked during generation + return 0 + + def unload(self): + logger.info("Unloading ExLlamaV3 model components...") + + if hasattr(self, 'vision_model') and self.vision_model is not None: + try: + del self.vision_model + except Exception as e: + logger.warning(f"Error unloading vision model: {e}") + self.vision_model = None + + if hasattr(self, 'model') and self.model is not None: + try: + self.model.unload() + del self.model + except Exception as e: + logger.warning(f"Error unloading main model: {e}") + self.model = None + + if hasattr(self, 'cache') and self.cache is not None: + self.cache = None + + if hasattr(self, 'generator') and self.generator is not None: + self.generator = None + + if hasattr(self, 'tokenizer') and self.tokenizer is not None: + self.tokenizer = None + + # Force GPU memory cleanup + import gc + + import torch + gc.collect() + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.cuda.empty_cache() + + logger.info("ExLlamaV3 model fully unloaded") diff --git a/modules/html_generator.py b/modules/html_generator.py index 79237f7f..63a0cdd0 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -406,16 +406,27 @@ def format_message_attachments(history, role, index): for attachment in attachments: name = html.escape(attachment["name"]) - # Make clickable if URL exists - if "url" in attachment: - name = f'{name}' + if attachment.get("type") == "image": + # Show image preview + file_path = attachment.get("file_path", "") + attachments_html += ( + f'
<div class="attachment-box image-attachment">' + f'<img src="file/{file_path}" alt="{name}" class="image-preview">' + f'<div class="attachment-name">{name}</div>' + f'</div>' + ) + else: + # Make clickable if URL exists (web search) + if "url" in attachment: + name = f'<a href="{attachment["url"]}" target="_blank" rel="noopener noreferrer">{name}</a>' + + attachments_html += ( + f'<div class="attachment-box">' + f'<div class="attachment-icon">{attachment_svg}</div>' + f'<div class="attachment-name">{name}</div>' + f'</div>' + ) - attachments_html += ( - f'<div class="attachment-box">' - f'<div class="attachment-icon">{attachment_svg}</div>' - f'<div class="attachment-name">{name}</div>' - f'</div>
' - ) attachments_html += '' return attachments_html diff --git a/modules/loaders.py b/modules/loaders.py index 7546bc5b..e9437c16 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -55,6 +55,11 @@ loaders_and_params = OrderedDict({ 'trust_remote_code', 'no_use_fast', ], + 'ExLlamav3': [ + 'ctx_size', + 'cache_type', + 'gpu_split', + ], 'ExLlamav2_HF': [ 'ctx_size', 'cache_type', @@ -248,6 +253,41 @@ loaders_samplers = { 'grammar_string', 'grammar_file_row', }, + 'ExLlamav3': { + 'temperature', + 'dynatemp_low', + 'dynatemp_high', + 'dynatemp_exponent', + 'smoothing_factor', + 'min_p', + 'top_p', + 'top_k', + 'typical_p', + 'xtc_threshold', + 'xtc_probability', + 'tfs', + 'top_a', + 'dry_multiplier', + 'dry_allowed_length', + 'dry_base', + 'repetition_penalty', + 'frequency_penalty', + 'presence_penalty', + 'repetition_penalty_range', + 'mirostat_mode', + 'mirostat_tau', + 'mirostat_eta', + 'dynamic_temperature', + 'temperature_last', + 'auto_max_new_tokens', + 'ban_eos_token', + 'add_bos_token', + 'enable_thinking', + 'skip_special_tokens', + 'seed', + 'custom_token_bans', + 'dry_sequence_breakers', + }, 'ExLlamav2': { 'temperature', 'dynatemp_low', diff --git a/modules/models.py b/modules/models.py index c1e7fb56..cc500a40 100644 --- a/modules/models.py +++ b/modules/models.py @@ -19,6 +19,7 @@ def load_model(model_name, loader=None): 'llama.cpp': llama_cpp_server_loader, 'Transformers': transformers_loader, 'ExLlamav3_HF': ExLlamav3_HF_loader, + 'ExLlamav3': ExLlamav3_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'TensorRT-LLM': TensorRT_LLM_loader, @@ -88,6 +89,14 @@ def ExLlamav3_HF_loader(model_name): return Exllamav3HF.from_pretrained(model_name) +def ExLlamav3_loader(model_name): + from modules.exllamav3 import Exllamav3Model + + model = Exllamav3Model.from_pretrained(model_name) + tokenizer = model.tokenizer + return model, tokenizer + + def ExLlamav2_HF_loader(model_name): from modules.exllamav2_hf import Exllamav2HF @@ -116,7 +125,9 @@ def unload_model(keep_model_name=False): return is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') - if shared.model.__class__.__name__ == 'Exllamav3HF': + if shared.args.loader in ['ExLlamav3_HF', 'ExLlamav3']: + shared.model.unload() + elif shared.args.loader in ['ExLlamav2_HF', 'ExLlamav2'] and hasattr(shared.model, 'unload'): shared.model.unload() shared.model = shared.tokenizer = None diff --git a/modules/shared.py b/modules/shared.py index ab5198d1..1de4306b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -318,6 +318,8 @@ def fix_loader_name(name): return 'ExLlamav2_HF' elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']: return 'ExLlamav3_HF' + elif name in ['exllamav3']: + return 'ExLlamav3' elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: return 'TensorRT-LLM' diff --git a/modules/text_generation.py b/modules/text_generation.py index 8d1950b9..d6a87ce8 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -40,7 +40,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']: + if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']: generate_func = generate_reply_custom else: generate_func = 
generate_reply_HF @@ -128,9 +128,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt from modules.torch_utils import get_device - if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel']: + if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']: input_ids = shared.tokenizer.encode(str(prompt)) - if shared.model.__class__.__name__ != 'Exllamav2Model': + if shared.model.__class__.__name__ not in ['Exllamav2Model', 'Exllamav3Model']: input_ids = np.array(input_ids).reshape(1, len(input_ids)) else: input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) @@ -148,7 +148,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu: return input_ids else: device = get_device() @@ -295,6 +295,8 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, _StopEverythingStoppingCriteria ) + # Native ExLlamav3Model handles multimodal internally - no special routing needed + if shared.args.loader == 'Transformers': clear_torch_cache() diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 1d85a398..3b922fb4 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -54,7 +54,7 @@ def create_ui(): gr.HTML(value='
<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf', 'image'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['typing-dots'] = gr.HTML(value='<div class="typing"><span></span><span class="dot1"></span><span class="dot2"></span></div>
', label='typing', elem_id='typing-container') with gr.Column(scale=1, elem_id='generate-stop-container'): From 6e9de75727ace45b3bf71ea3a98ef350b6d7414d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 19:35:09 -0700 Subject: [PATCH 03/51] Support loading chat templates from chat_template.json files --- modules/models_settings.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 4e53dc81..729d5dd1 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -139,16 +139,26 @@ def get_model_metadata(model): with open(jinja_path, 'r', encoding='utf-8') as f: template = f.read() + # 2. If no .jinja file, try chat_template.json + if template is None: + json_template_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.json' + if json_template_path.exists(): + with open(json_template_path, 'r', encoding='utf-8') as f: + json_data = json.load(f) + if 'chat_template' in json_data: + template = json_data['chat_template'] + + # 3. Fall back to tokenizer_config.json metadata if path.exists(): metadata = json.loads(open(path, 'r', encoding='utf-8').read()) - # 2. Only read from metadata if we haven't already loaded from .jinja + # Only read from metadata if we haven't already loaded from .jinja or .json if template is None and 'chat_template' in metadata: template = metadata['chat_template'] if isinstance(template, list): template = template[0]['template'] - # 3. If a template was found from either source, process it + # 4. If a template was found from any source, process it if template: for k in ['eos_token', 'bos_token']: if k in metadata: From 8fcadff8d3120d1f3e844cd030d59a8c2b0b2dfd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 20:13:54 -0700 Subject: [PATCH 04/51] mtmd: Use the base64 attachment for the UI preview instead of the file --- modules/chat.py | 1 - modules/html_generator.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 354ae46b..98800239 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -640,7 +640,6 @@ def add_message_attachment(history, row_idx, file_path, is_user=True): "type": "image", "image_data": data_url, "image_id": image_id, - "file_path": str(path) # For UI preview } elif file_extension == '.pdf': # Process PDF file diff --git a/modules/html_generator.py b/modules/html_generator.py index 63a0cdd0..cb14a722 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -407,11 +407,10 @@ def format_message_attachments(history, role, index): name = html.escape(attachment["name"]) if attachment.get("type") == "image": - # Show image preview - file_path = attachment.get("file_path", "") + image_data = attachment.get("image_data", "") attachments_html += ( f'
<div class="attachment-box image-attachment">' - f'<img src="file/{file_path}" alt="{name}" class="image-preview">' + f'<img src="{image_data}" alt="{name}" class="image-preview">' f'<div class="attachment-name">{name}</div>' f'</div>
' ) From 544c3a7c9f305b6a2141c3d02770250058d43322 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 21:15:53 -0700 Subject: [PATCH 05/51] Polish the new exllamav3 loader --- modules/exllamav3.py | 152 +++++++++++++++++++++++++++++-------------- modules/loaders.py | 21 +----- 2 files changed, 104 insertions(+), 69 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index c2532ec3..295c2737 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -2,8 +2,22 @@ import traceback from pathlib import Path from typing import Any, List, Tuple +import torch from exllamav3 import Cache, Config, Generator, Model, Tokenizer from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant +from exllamav3.generator import Job +# Import the base sampler components directly from exllamav3 +from exllamav3.generator.sampler import ( + CustomSampler, + SS_Argmax, + SS_MinP, + SS_PresFreqP, + SS_RepP, + SS_Sample, + SS_Temperature, + SS_TopK, + SS_TopP +) from extensions.openai.image_utils import ( convert_image_attachments_to_pil, @@ -11,6 +25,7 @@ from extensions.openai.image_utils import ( ) from modules import shared from modules.logging_colors import logger +from modules.text_generation import get_max_prompt_length try: import flash_attn @@ -79,7 +94,6 @@ class Exllamav3Model: load_params['use_per_device'] = split model.load(**load_params) - tokenizer = Tokenizer.from_config(config) # Load vision model component (ExLlamaV3 native) @@ -127,11 +141,9 @@ class Exllamav3Model: # From webui image_attachments (preferred format) if 'image_attachments' in state and state['image_attachments']: pil_images.extend(convert_image_attachments_to_pil(state['image_attachments'])) - # From OpenAI API raw_images elif 'raw_images' in state and state['raw_images']: pil_images.extend(state['raw_images']) - # From OpenAI API messages format elif 'messages' in state and state['messages']: pil_images.extend(convert_openai_messages_to_images(state['messages'])) @@ -147,7 +159,6 @@ class Exllamav3Model: image_embeddings = state['image_embeddings'] else: # Do not reset the cache/allocator index; it causes token ID conflicts during generation. - logger.info(f"Processing {len(pil_images)} image(s) with ExLlamaV3 vision model") image_embeddings = [ self.vision_model.get_image_embeddings(tokenizer=self.tokenizer, image=img) @@ -178,46 +189,98 @@ class Exllamav3Model: """ Generate text with streaming using native ExLlamaV3 API """ - from exllamav3 import Job - from exllamav3.generator.sampler.presets import ComboSampler - # Process images and modify prompt (ExLlamaV3-specific) prompt, image_embeddings = self._process_images_for_generation(prompt, state) - sampler = ComboSampler( - rep_p=state.get('repetition_penalty', 1.0), - freq_p=state.get('frequency_penalty', 0.0), - pres_p=state.get('presence_penalty', 0.0), - temperature=state.get('temperature', 0.7), - min_p=state.get('min_p', 0.0), - top_k=state.get('top_k', 0), - top_p=state.get('top_p', 1.0), - ) + # -- Manually build and sort the sampler stack -- + # Greedy decoding is a special case + if state['temperature'] == 0: + sampler = CustomSampler([SS_Argmax()]) + else: + # 1. 
Create a list of all active, unordered samplers + unordered_samplers = [] + + # Penalties + penalty_range = state['repetition_penalty_range'] + if penalty_range <= 0: + penalty_range = -1 # ExllamaV3 uses -1 for whole context + rep_decay = 0 # Not a configurable parameter + + # Add penalty samplers if they are active + if state['repetition_penalty'] != 1.0: + unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay)) + if state['presence_penalty'] != 0.0 or state['frequency_penalty'] != 0.0: + unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], state['frequency_penalty'], penalty_range, rep_decay)) + + # Standard samplers + if state['top_k'] > 0: + unordered_samplers.append(SS_TopK(state['top_k'])) + if state['top_p'] < 1.0: + unordered_samplers.append(SS_TopP(state['top_p'])) + if state['min_p'] > 0.0: + unordered_samplers.append(SS_MinP(state['min_p'])) + + # Temperature + unordered_samplers.append(SS_Temperature(state['temperature'])) + + # 2. Define the mapping from class names to the priority list keys + class_name_to_nickname = { + 'SS_RepP': 'repetition_penalty', + 'SS_PresFreqP': 'presence_frequency_penalty', + 'SS_TopK': 'top_k', + 'SS_TopP': 'top_p', + 'SS_MinP': 'min_p', + 'SS_Temperature': 'temperature', + } + + # 3. Get the priority list and handle temperature_last + default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature'] + sampler_priority = state.get('sampler_priority', default_priority) + + if state['temperature_last'] and 'temperature' in sampler_priority: + sampler_priority.append(sampler_priority.pop(sampler_priority.index('temperature'))) + + # 4. Sort the unordered list based on the priority list + def custom_sort_key(sampler_obj): + class_name = sampler_obj.__class__.__name__ + nickname = class_name_to_nickname.get(class_name) + if nickname in sampler_priority: + return sampler_priority.index(nickname) + return -1 + + ordered_samplers = sorted(unordered_samplers, key=custom_sort_key) + + # 5. 
Add the final sampling stage and build the sampler + ordered_samplers.append(SS_Sample()) + sampler = CustomSampler(ordered_samplers) + # -- End of sampler building -- # Encode prompt with embeddings (ExLlamaV3-specific) - if image_embeddings: - input_ids = self.tokenizer.encode( - prompt, - encode_special_tokens=True, - embeddings=image_embeddings, - ) + input_ids = self.tokenizer.encode( + prompt, + add_bos=state['add_bos_token'], + encode_special_tokens=True, + embeddings=image_embeddings, + ) + + input_ids = input_ids[:, -get_max_prompt_length(state):] + + # Determine max_new_tokens + if state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - input_ids.shape[-1] else: - input_ids = self.tokenizer.encode(prompt, encode_special_tokens=True) + max_new_tokens = state['max_new_tokens'] - # Get stop conditions from state (webui format) - keep as strings like ExLlamaV3 examples + # Get stop conditions stop_conditions = [] - if 'stopping_strings' in state and state['stopping_strings']: - # Use strings directly (ExLlamaV3 handles the conversion internally) - stop_conditions.extend(state['stopping_strings']) - - # Add EOS token ID as ExLlamaV3 examples do - if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: - stop_conditions.append(self.tokenizer.eos_token_id) + if not state['ban_eos_token']: + if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + stop_conditions.append(self.tokenizer.eos_token_id) job = Job( input_ids=input_ids, - max_new_tokens=state.get('max_new_tokens', 500), - decode_special_tokens=True, + max_new_tokens=max_new_tokens, + decode_special_tokens=not state['skip_special_tokens'], embeddings=image_embeddings if image_embeddings else None, sampler=sampler, stop_conditions=stop_conditions if stop_conditions else None, @@ -244,25 +307,16 @@ class Exllamav3Model: pass def generate(self, prompt, state): - """ - Generate text using native ExLlamaV3 API (non-streaming) - """ - output = self.generator.generate( - prompt=prompt, - max_new_tokens=state.get('max_new_tokens', 500), - temperature=state.get('temperature', 0.7), - top_p=state.get('top_p', 1.0), - top_k=state.get('top_k', 0), - repetition_penalty=state.get('repetition_penalty', 1.0), - frequency_penalty=state.get('frequency_penalty', 0.0), - presence_penalty=state.get('presence_penalty', 0.0), - min_p=state.get('min_p', 0.0), - ) + output = "" + for chunk in self.generate_with_streaming(prompt, state): + output = chunk return output def encode(self, string, **kwargs): - return self.tokenizer.encode(string, **kwargs) + # Default add_bos to True for consistency with exllamav2 behavior + add_bos = kwargs.pop('add_bos', True) + return self.tokenizer.encode(string, add_bos=add_bos, **kwargs) def decode(self, ids, **kwargs): return self.tokenizer.decode(ids, **kwargs) @@ -301,8 +355,6 @@ class Exllamav3Model: # Force GPU memory cleanup import gc - - import torch gc.collect() if torch.cuda.is_available(): diff --git a/modules/loaders.py b/modules/loaders.py index e9437c16..151de990 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -255,38 +255,21 @@ loaders_samplers = { }, 'ExLlamav3': { 'temperature', - 'dynatemp_low', - 'dynatemp_high', - 'dynatemp_exponent', - 'smoothing_factor', 'min_p', 'top_p', 'top_k', - 'typical_p', - 'xtc_threshold', - 'xtc_probability', - 'tfs', - 'top_a', - 'dry_multiplier', - 'dry_allowed_length', - 'dry_base', 'repetition_penalty', 'frequency_penalty', 'presence_penalty', 'repetition_penalty_range', - 
'mirostat_mode', - 'mirostat_tau', - 'mirostat_eta', - 'dynamic_temperature', 'temperature_last', + 'sampler_priority', 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', 'enable_thinking', - 'skip_special_tokens', 'seed', - 'custom_token_bans', - 'dry_sequence_breakers', + 'skip_special_tokens', }, 'ExLlamav2': { 'temperature', From 9e260332cc9da24a407bb59aadf0cf6a9cf0d88c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 21:22:47 -0700 Subject: [PATCH 06/51] Remove some unnecessary code --- modules/exllamav3.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 295c2737..d616d2f5 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -26,6 +26,7 @@ from extensions.openai.image_utils import ( from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length +from modules.torch_utils import clear_torch_cache try: import flash_attn @@ -342,6 +343,7 @@ class Exllamav3Model: del self.model except Exception as e: logger.warning(f"Error unloading main model: {e}") + self.model = None if hasattr(self, 'cache') and self.cache is not None: @@ -352,14 +354,3 @@ class Exllamav3Model: if hasattr(self, 'tokenizer') and self.tokenizer is not None: self.tokenizer = None - - # Force GPU memory cleanup - import gc - gc.collect() - - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() - torch.cuda.empty_cache() - - logger.info("ExLlamaV3 model fully unloaded") From 1168004067dbe37791e6911fc3ed3386d8131ce3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:01:55 -0700 Subject: [PATCH 07/51] Minor change --- modules/exllamav3.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index d616d2f5..f6c56cb0 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -6,7 +6,6 @@ import torch from exllamav3 import Cache, Config, Generator, Model, Tokenizer from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from exllamav3.generator import Job -# Import the base sampler components directly from exllamav3 from exllamav3.generator.sampler import ( CustomSampler, SS_Argmax, From 3f5ec9644f5aec2045126cdc5d962ee6f0b44c14 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:06:07 -0700 Subject: [PATCH 08/51] mtmd: Place the image <__media__> at the top of the prompt --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 98800239..0a03a084 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -289,7 +289,7 @@ def generate_chat_prompt(user_input, state, **kwargs): if image_refs or attachments_text: enhanced_user_msg = user_msg if image_refs: - enhanced_user_msg += f" {image_refs}" + enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}" if attachments_text: enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}" From d9db8f63a719f799bac8f05ed567a1ba38041a72 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:25:42 -0700 Subject: [PATCH 09/51] mtmd: Simplifications --- extensions/openai/completions.py | 33 +++++++------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py 
index 3d389f0b..ff64527a 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -407,6 +407,10 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # Add messages to generate_params if present for multimodal processing if 'messages' in body: generate_params['messages'] = body['messages'] + raw_images = convert_openai_messages_to_images(generate_params['messages']) + if raw_images: + logger.info(f"Found {len(raw_images)} image(s) in request.") + generate_params['raw_images'] = raw_images if not stream: prompt_arg = body[prompt_str] @@ -423,7 +427,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): total_prompt_token_count = 0 for idx, prompt in enumerate(prompt_arg, start=0): - if isinstance(prompt[0], int): + if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], int): # token lists if requested_model == shared.model_name: prompt = decode(prompt)[0] @@ -438,19 +442,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # generate reply ####################################### debug_msg({'prompt': prompt, 'generate_params': generate_params}) - - # Use multimodal generation if images are present - if 'messages' in generate_params: - raw_images = convert_openai_messages_to_images(generate_params['messages']) - if raw_images: - logger.info(f"Using multimodal generation for {len(raw_images)} images") - generate_params['raw_images'] = raw_images - generator = shared.model.generate_with_streaming(prompt, generate_params) - else: - generator = generate_reply(prompt, generate_params, is_chat=False) - else: - generator = generate_reply(prompt, generate_params, is_chat=False) - + generator = generate_reply(prompt, generate_params, is_chat=False) answer = '' for a in generator: @@ -523,18 +515,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # generate reply ####################################### debug_msg({'prompt': prompt, 'generate_params': generate_params}) - # Use multimodal generation if images are present - if 'messages' in generate_params: - raw_images = convert_openai_messages_to_images(generate_params['messages']) - if raw_images: - logger.info(f"Using multimodal generation for {len(raw_images)} images") - generate_params['raw_images'] = raw_images - generator = shared.model.generate_with_streaming(prompt, generate_params) - else: - generator = generate_reply(prompt, generate_params, is_chat=False) - else: - generator = generate_reply(prompt, generate_params, is_chat=False) - + generator = generate_reply(prompt, generate_params, is_chat=False) answer = '' seen_content = '' completion_token_count = 0 From fa9be444fa0b3e18763b1cd3d0dd07ad565ac1bb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:26:59 -0700 Subject: [PATCH 10/51] Use ExLlamav3 instead of ExLlamav3_HF by default for EXL3 models --- modules/models_settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 729d5dd1..d3bf4a36 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -211,11 +211,11 @@ def infer_loader(model_name, model_settings, hf_quant_method=None): elif re.match(r'.*\.gguf', model_name.lower()): loader = 'llama.cpp' elif hf_quant_method == 'exl3': - loader = 'ExLlamav3_HF' + loader = 'ExLlamav3' elif hf_quant_method in ['exl2', 'gptq']: loader = 'ExLlamav2_HF' elif re.match(r'.*exl3', 
model_name.lower()): - loader = 'ExLlamav3_HF' + loader = 'ExLlamav3' elif re.match(r'.*exl2', model_name.lower()): loader = 'ExLlamav2_HF' else: From f396b82a4f92f5823ed2a9bd1ff32d915da4cf9a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:31:36 -0700 Subject: [PATCH 11/51] mtmd: Better way to detect if an EXL3 model is multimodal --- modules/exllamav3.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index f6c56cb0..70f6c2f1 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -98,13 +98,16 @@ class Exllamav3Model: # Load vision model component (ExLlamaV3 native) vision_model = None - try: - logger.info("Loading vision model component...") - vision_model = Model.from_config(config, component="vision") - vision_model.load(progressbar=True) - logger.info("Vision model loaded successfully") - except Exception as e: - logger.warning(f"Vision model loading failed (multimodal disabled): {e}") + if "vision_config" in config.config_dict: + logger.info("Vision component detected in model config. Attempting to load...") + try: + vision_model = Model.from_config(config, component="vision") + vision_model.load(progressbar=True) + logger.info("Vision model loaded successfully.") + except Exception as e: + logger.warning(f"Vision model loading failed (multimodal disabled): {e}") + else: + logger.info("No vision component in model config. Skipping multimodal setup.") generator = Generator( model=model, From 59c6138e989861020816041821369fd9cd6b0ffa Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:32:15 -0700 Subject: [PATCH 12/51] Remove a log message --- modules/exllamav3.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 70f6c2f1..bdb0c760 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -48,7 +48,6 @@ class Exllamav3Model: global_allocator ) global_allocator.next_token_index = FIRST_MM_EMBEDDING_INDEX - logger.info("Reset MMTokenAllocator for clean multimodal token allocation") config = Config.from_directory(str(path_to_model)) model = Model.from_config(config) From 2fe79a93ccf21121db3dd076050df93f08c5bdb9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:50:24 -0700 Subject: [PATCH 13/51] mtmd: Handle another case after 3f5ec9644f5aec2045126cdc5d962ee6f0b44c14 --- modules/chat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 0a03a084..b127b489 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -326,7 +326,9 @@ def generate_chat_prompt(user_input, state, **kwargs): attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" if image_refs or attachments_text: - user_input = f"{user_input} {image_refs}" + user_input = user_input + if image_refs: + user_input = f"{image_refs}\n\n{user_input}" if attachments_text: user_input += f"\n\nATTACHMENTS:\n{attachments_text}" From a6d6bee88cf52dc4bbfaeac91df5a951810ab0dd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:51:03 -0700 Subject: [PATCH 14/51] Change a comment --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index b127b489..639feebf 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ 
-813,7 +813,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for file_path in files: add_message_attachment(output, row_idx, file_path, is_user=True) - # Collect image attachments for ExLlamaV3 + # Collect image attachments for multimodal generation image_attachments = [] if 'metadata' in output: user_key = f"user_{row_idx}" From d489eb589a698e37540861fe2951ef66efdb772d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 14:10:41 -0700 Subject: [PATCH 15/51] Attempt at fixing new exllamav3 loader undefined behavior when switching conversations --- modules/exllamav3.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index bdb0c760..e3a2d95a 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -304,9 +304,7 @@ class Exllamav3Model: response_text += chunk yield response_text finally: - # No cleanup needed. MMEmbedding lifetime is managed by Python. - # Cache and page table resets are unnecessary and can cause token ID conflicts. - pass + self.generator.clear_queue() def generate(self, prompt, state): output = "" From a289a92b9408e2542632ffa600ef57c373200aec Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 17:10:58 -0700 Subject: [PATCH 16/51] Fix exllamav3 token count --- modules/exllamav3.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index e3a2d95a..268a64ec 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -267,9 +267,11 @@ class Exllamav3Model: input_ids = input_ids[:, -get_max_prompt_length(state):] + self._last_prompt_token_count = input_ids.shape[-1] + # Determine max_new_tokens if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - input_ids.shape[-1] + max_new_tokens = state['truncation_length'] - self._last_prompt_token_count else: max_new_tokens = state['max_new_tokens'] @@ -323,8 +325,7 @@ class Exllamav3Model: @property def last_prompt_token_count(self): - # This would need to be tracked during generation - return 0 + return getattr(self, '_last_prompt_token_count', 0) def unload(self): logger.info("Unloading ExLlamaV3 model components...") From eb16f6401794340c751e8105bc3837967b9e054e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 17:12:16 -0700 Subject: [PATCH 17/51] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 
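The token-count fix above also changes how `auto_max_new_tokens` is budgeted: the count recorded after truncation is reused both for the generation budget and for the `last_prompt_token_count` property, which previously hard-coded 0. The arithmetic, with hypothetical numbers:

```python
# Budget arithmetic from the patch above (values are illustrative).
truncation_length = 8192       # total context the model was loaded with
prompt_tokens = 7000           # input_ids length after truncation
auto_max_new_tokens = True
max_new_tokens_setting = 512

if auto_max_new_tokens:
    max_new_tokens = truncation_length - prompt_tokens  # 1192: fill the context
else:
    max_new_tokens = max_new_tokens_setting             # fixed 512

# The same stored count now backs last_prompt_token_count, so prompt-token
# usage reported by the API reflects the real prompt instead of 0.
```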
files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index f17cae8a..323ef0f9 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 51f4571f..2a7c9361 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 37021c77..0106fbea 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == 
"Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index f54ae191..d5db4a1c 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index e495455b..694f1ff8 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and 
python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 72847534..392637e2 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index ed641a24..88eaa930 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index d7fe735b..6accc2f0 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ 
sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index cb71f74b..3025f092 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index d6bed576..7394bdcf 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 1f17dc50..a095a4c7 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 82254842..ea43e56e 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; 
platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 986a3d49..79737728 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 833e923b..d39786bd 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 6a894d49..0b373fa9 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 0afb19c2..fe9dccac 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index a404f50c..b3cfd525 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 75176656..02aa03e3 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From d86b0ec0103f24f27329a600d1cbaf3a5ea4c517 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Sun, 
10 Aug 2025 01:27:25 -0300 Subject: [PATCH 18/51] Add multimodal support (llama.cpp) (#7027) --- extensions/openai/image_utils.py | 9 ++++ modules/chat.py | 26 ++++++------ modules/llama_cpp_server.py | 46 ++++++++++++++++++--- modules/loaders.py | 2 + modules/shared.py | 1 + modules/ui.py | 1 + modules/ui_model_menu.py | 6 +++ modules/utils.py | 13 ++++++ user_data/mmproj/place-your-mmproj-here.txt | 0 9 files changed, 86 insertions(+), 18 deletions(-) create mode 100644 user_data/mmproj/place-your-mmproj-here.txt diff --git a/extensions/openai/image_utils.py b/extensions/openai/image_utils.py index c54f0532..658f00d7 100644 --- a/extensions/openai/image_utils.py +++ b/extensions/openai/image_utils.py @@ -11,6 +11,15 @@ from PIL import Image from modules.logging_colors import logger +def convert_pil_to_base64(image: Image.Image) -> str: + """Converts a PIL Image to a base64 encoded string.""" + buffered = io.BytesIO() + # Save image to an in-memory bytes buffer in PNG format + image.save(buffered, format="PNG") + # Encode the bytes to a base64 string + return base64.b64encode(buffered.getvalue()).decode('utf-8') + + def decode_base64_image(base64_string: str) -> Image.Image: """Decodes a base64 string to a PIL Image.""" try: diff --git a/modules/chat.py b/modules/chat.py index 639feebf..696fa350 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -813,19 +813,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for file_path in files: add_message_attachment(output, row_idx, file_path, is_user=True) - # Collect image attachments for multimodal generation - image_attachments = [] - if 'metadata' in output: - user_key = f"user_{row_idx}" - if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: - for attachment in output['metadata'][user_key]["attachments"]: - if attachment.get("type") == "image": - image_attachments.append(attachment) - - # Add image attachments to state for the generation - if image_attachments: - state['image_attachments'] = image_attachments - # Add web search results as attachments if enabled if state.get('enable_web_search', False): search_query = generate_search_query(text, state) @@ -881,6 +868,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess 'metadata': output['metadata'] } + # Collect image attachments for multimodal generation + image_attachments = [] + if 'metadata' in output: + user_key = f"user_{row_idx}" + if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: + for attachment in output['metadata'][user_key]["attachments"]: + if attachment.get("type") == "image": + image_attachments.append(attachment) + + # Add image attachments to state for the generation + if image_attachments: + state['image_attachments'] = image_attachments + # Generate the prompt kwargs = { '_continue': _continue, diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index e64f1694..072ff83b 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -12,6 +12,10 @@ from pathlib import Path import llama_cpp_binaries import requests +from extensions.openai.image_utils import ( + convert_image_attachments_to_pil, + convert_pil_to_base64 +) from modules import shared from modules.logging_colors import logger @@ -128,15 +132,40 @@ class LlamaServer: url = f"http://127.0.0.1:{self.port}/completion" payload = self.prepare_payload(state) - token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"]) - 
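`convert_pil_to_base64` above is a plain in-memory PNG encode. A quick round-trip check, assuming Pillow is installed (the import path reflects this patch; a later patch in this series moves the module to `modules/image_utils.py`):

```python
import base64
import io

from PIL import Image

from extensions.openai.image_utils import convert_pil_to_base64

img = Image.new("RGB", (32, 32), color=(200, 80, 40))  # synthetic test image
b64 = convert_pil_to_base64(img)                       # PNG bytes -> base64 str
restored = Image.open(io.BytesIO(base64.b64decode(b64)))
assert restored.size == (32, 32) and restored.format == "PNG"
```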
self.last_prompt_token_count = len(token_ids) + pil_images = [] + # Check for images from the Web UI (image_attachments) + if 'image_attachments' in state and state['image_attachments']: + pil_images.extend(convert_image_attachments_to_pil(state['image_attachments'])) + # Else, check for images from the API (raw_images) + elif 'raw_images' in state and state['raw_images']: + pil_images.extend(state.get('raw_images', [])) + + if pil_images: + # Multimodal case + IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image + + base64_images = [convert_pil_to_base64(img) for img in pil_images] + multimodal_prompt_object = { + "prompt": prompt, + "multimodal_data": base64_images + } + payload["prompt"] = multimodal_prompt_object + + # Calculate an estimated token count + text_tokens = self.encode(prompt, add_bos_token=state["add_bos_token"]) + self.last_prompt_token_count = len(text_tokens) + (len(pil_images) * IMAGE_TOKEN_COST_ESTIMATE) + else: + # Text only case + token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"]) + self.last_prompt_token_count = len(token_ids) + payload["prompt"] = token_ids + if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - len(token_ids) + max_new_tokens = state['truncation_length'] - self.last_prompt_token_count else: max_new_tokens = state['max_new_tokens'] payload.update({ - "prompt": token_ids, "n_predict": max_new_tokens, "stream": True, "cache_prompt": True @@ -144,7 +173,7 @@ class LlamaServer: if shared.args.verbose: logger.info("GENERATE_PARAMS=") - printable_payload = {k: v for k, v in payload.items() if k != "prompt"} + printable_payload = {k: (v if k != "prompt" else "[multimodal object]" if pil_images else v) for k, v in payload.items()} pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() @@ -295,6 +324,13 @@ class LlamaServer: cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] if shared.args.rope_freq_base > 0: cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)] + if shared.args.mmproj not in [None, 'None']: + path = Path(shared.args.mmproj) + if not path.exists(): + path = Path('user_data/mmproj') / shared.args.mmproj + + if path.exists(): + cmd += ["--mmproj", str(path)] if shared.args.model_draft not in [None, 'None']: path = Path(shared.args.model_draft) if not path.exists(): diff --git a/modules/loaders.py b/modules/loaders.py index 151de990..feca9985 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -28,6 +28,8 @@ loaders_and_params = OrderedDict({ 'device_draft', 'ctx_size_draft', 'speculative_decoding_accordion', + 'mmproj', + 'mmproj_accordion', 'vram_info', ], 'Transformers': [ diff --git a/modules/shared.py b/modules/shared.py index 1de4306b..e9d8a62f 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -85,6 +85,7 @@ group.add_argument('--no-kv-offload', action='store_true', help='Do not offload group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". 
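The payload shape above is the crux of the llama.cpp multimodal path: when images are present, `prompt` becomes an object carrying the text plus base64-encoded images, and the prompt length can only be estimated, since the mmproj projector decides how many tokens each image actually occupies. A sketch of the resulting request body, with hypothetical values:

```python
# Shape of the llama-server /completion payload built above (illustrative).
IMAGE_TOKEN_COST_ESTIMATE = 600  # conservative per-image guess from the patch

payload = {
    "prompt": {
        "prompt": "USER: <__media__>\nDescribe this image.\nASSISTANT:",
        "multimodal_data": ["iVBORw0KGgo..."],  # base64 PNG (truncated here)
    },
    "n_predict": 512,
    "stream": True,
    "cache_prompt": True,
}

text_tokens = 25  # stand-in for len(self.encode(prompt, ...))
estimated_prompt_tokens = text_tokens + 1 * IMAGE_TOKEN_COST_ESTIMATE
```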
Example: "override-tensor=exps=CPU"') group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') +group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.') # Cache group = parser.add_argument_group('Context and cache') diff --git a/modules/ui.py b/modules/ui.py index e7805046..1171cd48 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -167,6 +167,7 @@ def list_model_elements(): 'gpu_layers_draft', 'device_draft', 'ctx_size_draft', + 'mmproj', ] return elements diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 0ab67e7a..9fa8a4f4 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -59,6 +59,12 @@ def create_ui(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') + # Multimodal + with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']: + with gr.Row(): + shared.gradio['mmproj'] = gr.Dropdown(label="mmproj file", choices=utils.get_available_mmproj(), value=lambda: shared.args.mmproj or 'None', elem_classes='slim-dropdown', info='Select a file that matches your model. 
Must be placed in user_data/mmproj/', interactive=not mu) + ui.create_refresh_button(shared.gradio['mmproj'], lambda: None, lambda: {'choices': utils.get_available_mmproj()}, 'refresh-button', interactive=not mu) + # Speculative decoding with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']: with gr.Row(): diff --git a/modules/utils.py b/modules/utils.py index 117ad590..4927ef04 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -154,6 +154,19 @@ def get_available_ggufs(): return sorted(model_list, key=natural_keys) +def get_available_mmproj(): + mmproj_dir = Path('user_data/mmproj') + if not mmproj_dir.exists(): + return ['None'] + + mmproj_files = [] + for item in mmproj_dir.iterdir(): + if item.is_file() and item.suffix.lower() in ('.gguf', '.bin'): + mmproj_files.append(item.name) + + return ['None'] + sorted(mmproj_files, key=natural_keys) + + def get_available_presets(): return sorted(set((k.stem for k in Path('user_data/presets').glob('*.yaml'))), key=natural_keys) diff --git a/user_data/mmproj/place-your-mmproj-here.txt b/user_data/mmproj/place-your-mmproj-here.txt new file mode 100644 index 00000000..e69de29b From c6b4d1e87f67dd990c66eaecb35c9cc70d0ae4e3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:33:12 -0700 Subject: [PATCH 19/51] Fix the exllamav2 loader ignoring add_bos --- modules/exllamav2.py | 3 ++- modules/exllamav3.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 6bb422ea..5d5c5b56 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -135,7 +135,8 @@ class Exllamav2Model: return result, result def encode(self, string, **kwargs): - return self.tokenizer.encode(string, add_bos=True, encode_special_tokens=True) + add_bos = kwargs.pop('add_bos', True) + return self.tokenizer.encode(string, add_bos=add_bos, encode_special_tokens=True, **kwargs) def decode(self, ids, **kwargs): if isinstance(ids, list): diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 268a64ec..9201801c 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -316,7 +316,6 @@ class Exllamav3Model: return output def encode(self, string, **kwargs): - # Default add_bos to True for consistency with exllamav2 behavior add_bos = kwargs.pop('add_bos', True) return self.tokenizer.encode(string, add_bos=add_bos, **kwargs) From 2f90ac98807a4ffa6a761bbcef5cf81a9de568b8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:41:38 -0700 Subject: [PATCH 20/51] Move the new image_utils.py file to modules/ --- extensions/openai/completions.py | 2 +- modules/exllamav3.py | 4 ++-- {extensions/openai => modules}/image_utils.py | 0 modules/llama_cpp_server.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) rename {extensions/openai => modules}/image_utils.py (100%) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index ff64527a..f4944060 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -7,7 +7,6 @@ import tiktoken from pydantic import ValidationError from extensions.openai.errors import InvalidRequestError -from extensions.openai.image_utils import convert_openai_messages_to_images from extensions.openai.typing import ToolDefinition from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from modules import shared @@ 
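The `add_bos` fix above follows a common kwargs-passthrough pattern: pop the flag with a default so existing callers keep the old behavior while new callers can disable the BOS token. A self-contained sketch with stand-in token ids:

```python
# Stand-in for the tokenizer call; real encode() delegates to the tokenizer.
def encode(string, **kwargs):
    add_bos = kwargs.pop('add_bos', True)          # default preserved for old callers
    return ([1] if add_bos else []) + [101, 102]   # fake BOS id + fake token ids

assert encode("hi") == [1, 101, 102]               # unchanged default behavior
assert encode("hi", add_bos=False) == [101, 102]   # flag is now actually honored
```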
-17,6 +16,7 @@ from modules.chat import ( load_character_memoized, load_instruction_template_memoized ) +from modules.image_utils import convert_openai_messages_to_images from modules.logging_colors import logger from modules.presets import load_preset_memoized from modules.text_generation import decode, encode, generate_reply diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 9201801c..9d597ce7 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -18,11 +18,11 @@ from exllamav3.generator.sampler import ( SS_TopP ) -from extensions.openai.image_utils import ( +from modules import shared +from modules.image_utils import ( convert_image_attachments_to_pil, convert_openai_messages_to_images ) -from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length from modules.torch_utils import clear_torch_cache diff --git a/extensions/openai/image_utils.py b/modules/image_utils.py similarity index 100% rename from extensions/openai/image_utils.py rename to modules/image_utils.py diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 072ff83b..3e8127ab 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -12,11 +12,11 @@ from pathlib import Path import llama_cpp_binaries import requests -from extensions.openai.image_utils import ( +from modules import shared +from modules.image_utils import ( convert_image_attachments_to_pil, convert_pil_to_base64 ) -from modules import shared from modules.logging_colors import logger llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"} From 4663b1a56e99b8d637f9ac67c8b8d0e09d496ec7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:45:50 -0700 Subject: [PATCH 21/51] Update docs --- docs/12 - OpenAI API.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index b7b5fbc1..fc76cd8b 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -77,7 +77,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` -#### Multimodal support (ExLlamaV3) +#### Multimodal support (llama.cpp and ExLlamaV3) ```shell curl http://127.0.0.1:5000/v1/chat/completions \ From 0ea62d88f60689b44dd4ee42ae9ba0ff871a29c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:47:02 -0700 Subject: [PATCH 22/51] mtmd: Fix "continue" when an image is present --- modules/chat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 696fa350..42bb58a5 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -868,6 +868,8 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess 'metadata': output['metadata'] } + row_idx = len(output['internal']) - 1 + # Collect image attachments for multimodal generation image_attachments = [] if 'metadata' in output: @@ -895,7 +897,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess prompt = generate_chat_prompt(text, state, **kwargs) # Add timestamp for assistant's response at the start of generation - row_idx = len(output['internal']) - 1 update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp(), model_name=shared.model_name) # Generate From 1fb580785937ad42e8657a5fd894dfcd5a1fdeb3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 
2025 06:54:44 -0700 Subject: [PATCH 23/51] mtmd: Fix API text completion when no images are sent --- extensions/openai/completions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index f4944060..6f4dfc29 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -405,7 +405,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): echo = body['echo'] # Add messages to generate_params if present for multimodal processing - if 'messages' in body: + if body.get('messages'): generate_params['messages'] = body['messages'] raw_images = convert_openai_messages_to_images(generate_params['messages']) if raw_images: From 6fbf162d712cef876b128651fdebeb08a4f32538 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 2025 07:21:55 -0700 Subject: [PATCH 24/51] Default max_tokens to 512 in the API instead of 16 --- extensions/openai/typing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index e9f92da5..90366270 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -106,7 +106,7 @@ class CompletionRequestParams(BaseModel): frequency_penalty: float | None = 0 logit_bias: dict | None = None logprobs: int | None = None - max_tokens: int | None = 16 + max_tokens: int | None = 512 n: int | None = Field(default=1, description="Unused parameter.") presence_penalty: float | None = 0 stop: str | List[str] | None = None @@ -232,7 +232,7 @@ class LogitsRequestParams(BaseModel): use_samplers: bool = False top_logits: int | None = 50 frequency_penalty: float | None = 0 - max_tokens: int | None = 16 + max_tokens: int | None = 512 presence_penalty: float | None = 0 temperature: float | None = 1 top_p: float | None = 1 From cc964ee579463d4c3acb35c188ee8eb38e23ce1a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 2025 07:44:38 -0700 Subject: [PATCH 25/51] mtmd: Increase the size of the UI image preview --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index de16d81d..062d3eb2 100644 --- a/css/main.css +++ b/css/main.css @@ -1579,6 +1579,7 @@ strong { .image-attachment { flex-direction: column; + max-width: 314px; } .image-preview { From 9ec310d858824d5f9186bca277a3ab77ac556b75 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 2025 07:54:21 -0700 Subject: [PATCH 26/51] UI: Fix the color of italic text --- css/html_instruct_style.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 9831ee8f..3e5ebe67 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -13,7 +13,7 @@ line-height: 28px !important; } -.dark .chat .message-body :is(p, li, q, h1, h2, h3, h4, h5, h6) { +.dark .chat .message-body :is(p, li, q, em, h1, h2, h3, h4, h5, h6) { color: #d1d5db !important; } From c5340533c0b3a9edaea6c253f99250f09f2c26a5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 2025 20:39:04 -0700 Subject: [PATCH 27/51] mtmd: Add another API example --- docs/12 - OpenAI API.md | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 
fc76cd8b..5dc98a51 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -77,7 +77,9 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` -#### Multimodal support (llama.cpp and ExLlamaV3) +#### Multimodal/vision (llama.cpp and ExLlamaV3) + +##### /v1/chat/completions (recommended!) ```shell curl http://127.0.0.1:5000/v1/chat/completions \ @@ -87,7 +89,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ { "role": "user", "content": [ - {"type": "text", "text": "What color is this image?"}, + {"type": "text", "text": "Please describe what you see in this image."}, {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}} ] } @@ -95,6 +97,38 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` +##### /v1/completions + +```shell +curl http://127.0.0.1:5000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "About image <__media__> and image <__media__>, what I can say is that the first one" + }, + { + "type": "image_url", + "image_url": { + "url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true" + } + }, + { + "type": "image_url", + "image_url": { + "url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/strawberry.png?raw=true" + } + } + ] + } + ] + }' +``` + #### SSE streaming ```shell From 4d8dbbab648d14680741324d187bee23a8bea486 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:26:11 -0700 Subject: [PATCH 28/51] API: Fix sampler_priority usage for ExLlamaV3 --- modules/exllamav3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 9d597ce7..8f686669 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -237,7 +237,7 @@ class Exllamav3Model: # 3. 
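For reference, the `/v1/completions` example above can be issued from Python as well; this assumes the local server from the docs and the `requests` package, with images consumed in order, each one filling the next `<__media__>` marker:

```python
import requests

resp = requests.post(
    "http://127.0.0.1:5000/v1/completions",
    json={
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text",
                 "text": "About image <__media__> and image <__media__>, what I can say is that the first one"},
                {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}},
                {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/strawberry.png?raw=true"}},
            ],
        }],
    },
)
print(resp.json()["choices"][0]["text"])  # assumes the standard completions shape
```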
Get the priority list and handle temperature_last default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature'] - sampler_priority = state.get('sampler_priority', default_priority) + sampler_priority = state.get('sampler_priority') or default_priority if state['temperature_last'] and 'temperature' in sampler_priority: sampler_priority.append(sampler_priority.pop(sampler_priority.index('temperature'))) From 4809ddfeb85e8b8d28bb617366c86fd8037815ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:35:22 -0700 Subject: [PATCH 29/51] Exllamav3: small sampler fixes --- modules/exllamav3.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 8f686669..5c142ec2 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -6,7 +6,9 @@ import torch from exllamav3 import Cache, Config, Generator, Model, Tokenizer from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from exllamav3.generator import Job -from exllamav3.generator.sampler import ( + +from modules import shared +from modules.exllamav3_custom_sampler import ( CustomSampler, SS_Argmax, SS_MinP, @@ -17,8 +19,6 @@ from exllamav3.generator.sampler import ( SS_TopK, SS_TopP ) - -from modules import shared from modules.image_utils import ( convert_image_attachments_to_pil, convert_openai_messages_to_images @@ -194,7 +194,6 @@ class Exllamav3Model: # Process images and modify prompt (ExLlamaV3-specific) prompt, image_embeddings = self._process_images_for_generation(prompt, state) - # -- Manually build and sort the sampler stack -- # Greedy decoding is a special case if state['temperature'] == 0: sampler = CustomSampler([SS_Argmax()]) @@ -205,7 +204,7 @@ class Exllamav3Model: # Penalties penalty_range = state['repetition_penalty_range'] if penalty_range <= 0: - penalty_range = -1 # ExllamaV3 uses -1 for whole context + penalty_range = int(10e7) # Use large number for "full context" rep_decay = 0 # Not a configurable parameter # Add penalty samplers if they are active @@ -222,7 +221,7 @@ class Exllamav3Model: if state['min_p'] > 0.0: unordered_samplers.append(SS_MinP(state['min_p'])) - # Temperature + # Temperature (SS_NoOp is returned if temp is 1.0) unordered_samplers.append(SS_Temperature(state['temperature'])) # 2. Define the mapping from class names to the priority list keys @@ -246,7 +245,7 @@ class Exllamav3Model: def custom_sort_key(sampler_obj): class_name = sampler_obj.__class__.__name__ nickname = class_name_to_nickname.get(class_name) - if nickname in sampler_priority: + if nickname and nickname in sampler_priority: return sampler_priority.index(nickname) return -1 @@ -255,7 +254,6 @@ class Exllamav3Model: # 5. 
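Two details in the sampler changes above are easy to miss. First, `dict.get(key, default)` still returns `None` or an empty list when the key is present but falsy, so the fix switches to `get(key) or default`. Second, the sort key returns -1 for stages with no entry in the priority list, so they run first instead of raising `ValueError` from `.index()`. A condensed sketch with a hypothetical priority list:

```python
state = {'sampler_priority': None}  # key present but unset
default_priority = ['repetition_penalty', 'top_k', 'top_p', 'min_p', 'temperature']

# .get(key, default) would return None here; `or` falls back as intended.
priority = state.get('sampler_priority') or default_priority

class_name_to_nickname = {'SS_TopK': 'top_k', 'SS_Temperature': 'temperature'}

def custom_sort_key(sampler_obj):
    nickname = class_name_to_nickname.get(sampler_obj.__class__.__name__)
    # Unlisted stages sort first (-1); listed ones follow the user's order.
    return priority.index(nickname) if nickname and nickname in priority else -1
```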
Add the final sampling stage and build the sampler ordered_samplers.append(SS_Sample()) sampler = CustomSampler(ordered_samplers) - # -- End of sampler building -- # Encode prompt with embeddings (ExLlamaV3-specific) input_ids = self.tokenizer.encode( From 1cb800d3923093e102470c6dde4e4a8b451e0a33 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:37:10 -0700 Subject: [PATCH 30/51] Docs: small change --- docs/12 - OpenAI API.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 5dc98a51..fd3309c7 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -79,7 +79,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ #### Multimodal/vision (llama.cpp and ExLlamaV3) -##### /v1/chat/completions (recommended!) +##### With /v1/chat/completions (recommended!) ```shell curl http://127.0.0.1:5000/v1/chat/completions \ @@ -97,7 +97,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` -##### /v1/completions +##### With /v1/completions ```shell curl http://127.0.0.1:5000/v1/completions \ From 52d1cbbbe95bed853241ab422b0558ab029d7d08 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:38:39 -0700 Subject: [PATCH 31/51] Fix an import --- modules/exllamav3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 5c142ec2..3fabdb6b 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -8,7 +8,7 @@ from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from exllamav3.generator import Job from modules import shared -from modules.exllamav3_custom_sampler import ( +from exllamav3.generator.sampler import ( CustomSampler, SS_Argmax, SS_MinP, From 38c0b4a1adc613e9e3a237835faa1d88632733ef Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:39:53 -0700 Subject: [PATCH 32/51] Default ctx-size to 8192 when not found in the metadata --- modules/models_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index d3bf4a36..bf7b1cf9 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -15,7 +15,7 @@ from modules.logging_colors import logger def get_fallback_settings(): return { 'bf16': False, - 'ctx_size': 2048, + 'ctx_size': 8192, 'rope_freq_base': 0, 'compress_pos_emb': 1, 'alpha_value': 1, From b62c8845f34f3faac2481e368e6a4c67fd33fa59 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 08:22:17 -0700 Subject: [PATCH 33/51] mtmd: Fix /chat/completions for llama.cpp --- extensions/openai/completions.py | 18 +++++++++++++++--- modules/chat.py | 21 +++++++++++---------- modules/llama_cpp_server.py | 8 ++++++-- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 6f4dfc29..c3037d0c 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -85,16 +85,28 @@ def process_parameters(body, is_legacy=False): def process_multimodal_content(content): - """Extract text from OpenAI multimodal format for non-multimodal models""" + """Extract text and add image placeholders from OpenAI multimodal format""" if isinstance(content, str): return content if isinstance(content, list): text_parts = [] + 
image_placeholders = "" for item in content: - if isinstance(item, dict) and item.get('type') == 'text': + if not isinstance(item, dict): + continue + + item_type = item.get('type', '') + if item_type == 'text': text_parts.append(item.get('text', '')) - return ' '.join(text_parts) if text_parts else str(content) + elif item_type == 'image_url': + image_placeholders += "<__media__>" + + final_text = ' '.join(text_parts) + if image_placeholders: + return f"{image_placeholders}\n\n{final_text}" + else: + return final_text return str(content) diff --git a/modules/chat.py b/modules/chat.py index 42bb58a5..7b1629dd 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -870,18 +870,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess row_idx = len(output['internal']) - 1 - # Collect image attachments for multimodal generation - image_attachments = [] + # Collect image attachments for multimodal generation from the entire history + all_image_attachments = [] if 'metadata' in output: - user_key = f"user_{row_idx}" - if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: - for attachment in output['metadata'][user_key]["attachments"]: - if attachment.get("type") == "image": - image_attachments.append(attachment) + for i in range(len(output['internal'])): + user_key = f"user_{i}" + if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: + for attachment in output['metadata'][user_key]["attachments"]: + if attachment.get("type") == "image": + all_image_attachments.append(attachment) - # Add image attachments to state for the generation - if image_attachments: - state['image_attachments'] = image_attachments + # Add all collected image attachments to state for the generation + if all_image_attachments: + state['image_attachments'] = all_image_attachments # Generate the prompt kwargs = { diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 3e8127ab..63c8eda0 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -15,6 +15,7 @@ import requests from modules import shared from modules.image_utils import ( convert_image_attachments_to_pil, + convert_openai_messages_to_images, convert_pil_to_base64 ) from modules.logging_colors import logger @@ -133,10 +134,13 @@ class LlamaServer: payload = self.prepare_payload(state) pil_images = [] - # Check for images from the Web UI (image_attachments) + # Source 1: Web UI (from chatbot_wrapper) if 'image_attachments' in state and state['image_attachments']: pil_images.extend(convert_image_attachments_to_pil(state['image_attachments'])) - # Else, check for images from the API (raw_images) + # Source 2: Chat Completions API (/v1/chat/completions) + elif 'history' in state and state.get('history', {}).get('messages'): + pil_images.extend(convert_openai_messages_to_images(state['history']['messages'])) + # Source 3: Legacy Completions API (/v1/completions) elif 'raw_images' in state and state['raw_images']: pil_images.extend(state.get('raw_images', [])) From b10d525bf7618c415c88937e26c1a2240c3b2fcf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:05:22 -0700 Subject: [PATCH 34/51] UI: Update a tooltip --- js/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index e0f9314d..66a344b3 100644 --- a/js/main.js +++ b/js/main.js @@ -977,7 +977,7 @@ if (document.readyState === "loading") { 
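With the changes above, the API and UI paths converge: `process_multimodal_content` prepends one `<__media__>` marker per image, and the llama.cpp server checks image sources in a fixed order (Web UI attachments, then chat-completions history, then legacy `raw_images`). The expected behavior of the updated function, as an illustrative check:

```python
content = [
    {"type": "text", "text": "Compare these."},
    {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
    {"type": "image_url", "image_url": {"url": "https://example.com/b.png"}},
]
# One marker per image, then a blank line, then the joined text parts.
assert process_multimodal_content(content) == "<__media__><__media__>\n\nCompare these."
```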
//------------------------------------------------ // File upload button -document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, and DOCX documents"; +document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, DOCX documents, and images"; // Activate web search document.getElementById("web-search").title = "Search the internet with DuckDuckGo"; From 1ba1211ca027b887f160f6894a004b7d96ea0eee Mon Sep 17 00:00:00 2001 From: Mykeehu Date: Mon, 11 Aug 2025 21:13:56 +0200 Subject: [PATCH 35/51] Fix edit window and buttons in Messenger theme (#7100) --- css/chat_style-messenger.css | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index 65af5f7a..583703c0 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -99,3 +99,9 @@ .message-body p em { color: rgb(110 110 110) !important; } +.editing-textarea { + width: max(30rem) !important; +} +.circle-you + .text .edit-control-button, .circle-you + .text .editing-textarea { + color: #000 !important; +} From 999471256c0626bb29e9caa65bbf96b8d2cb52d6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:32:17 -0700 Subject: [PATCH 36/51] Lint --- modules/exllamav2.py | 2 +- modules/exllamav3.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 5d5c5b56..3b3233d2 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -3,6 +3,7 @@ import traceback from pathlib import Path import torch + from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -15,7 +16,6 @@ from exllamav2 import ( ExLlamaV2Tokenizer ) from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator - from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 3fabdb6b..980230f8 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -2,12 +2,9 @@ import traceback from pathlib import Path from typing import Any, List, Tuple -import torch from exllamav3 import Cache, Config, Generator, Model, Tokenizer from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from exllamav3.generator import Job - -from modules import shared from exllamav3.generator.sampler import ( CustomSampler, SS_Argmax, @@ -19,13 +16,13 @@ from exllamav3.generator.sampler import ( SS_TopK, SS_TopP ) +from modules import shared from modules.image_utils import ( convert_image_attachments_to_pil, convert_openai_messages_to_images ) from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length -from modules.torch_utils import clear_torch_cache try: import flash_attn @@ -205,13 +202,13 @@ class Exllamav3Model: penalty_range = state['repetition_penalty_range'] if penalty_range <= 0: penalty_range = int(10e7) # Use large number for "full context" - rep_decay = 0 # Not a configurable parameter + rep_decay = 0 # Not a configurable parameter # Add penalty samplers if they are active if state['repetition_penalty'] != 1.0: - unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay)) + unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay)) if state['presence_penalty'] != 0.0 or state['frequency_penalty'] != 0.0: - unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], 
state['frequency_penalty'], penalty_range, rep_decay)) + unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], state['frequency_penalty'], penalty_range, rep_decay)) # Standard samplers if state['top_k'] > 0: From a78ca6ffcdf0c53efdce8bfa6b37825590f5ae6e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:33:38 -0700 Subject: [PATCH 37/51] Remove a comment --- modules/text_generation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index d6a87ce8..27c5de7d 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -295,8 +295,6 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, _StopEverythingStoppingCriteria ) - # Native ExLlamav3Model handles multimodal internally - no special routing needed - if shared.args.loader == 'Transformers': clear_torch_cache() From 765af1ba1736b209427232d5bec1b2e55b099e1b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:39:18 -0700 Subject: [PATCH 38/51] API: Improve a validation --- extensions/openai/typing.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 90366270..56d91582 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -2,7 +2,7 @@ import json import time from typing import Dict, List, Optional -from pydantic import BaseModel, Field, field_validator, validator +from pydantic import BaseModel, Field, model_validator, validator class GenerationOptions(BaseModel): @@ -116,16 +116,11 @@ class CompletionRequestParams(BaseModel): top_p: float | None = 1 user: str | None = Field(default=None, description="Unused parameter.") - @field_validator('prompt', 'messages') - @classmethod - def validate_prompt_or_messages(cls, v, info): - """Ensure either 'prompt' or 'messages' is provided for completions.""" - if info.field_name == 'prompt': # If we're validating 'prompt', check if neither prompt nor messages will be set - messages = info.data.get('messages') - if v is None and messages is None: - raise ValueError("Either 'prompt' or 'messages' must be provided") - - return v + @model_validator(mode='after') + def validate_prompt_or_messages(self): + if self.prompt is None and self.messages is None: + raise ValueError("Either 'prompt' or 'messages' must be provided") + return self class CompletionRequest(GenerationOptions, CompletionRequestParams): From 1e3c4e8bdbc3e8d313bfab016bc6f1853c4ad4b7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 14:40:59 -0700 Subject: [PATCH 39/51] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- 
requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 323ef0f9..789539fc 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 2a7c9361..d7922478 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0106fbea..2a3337a3 100644 --- 
a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index d5db4a1c..7287497d 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 694f1ff8..48ebe381 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release 
>= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 392637e2..ccf80d06 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 88eaa930..e819dd04 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; 
platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 6accc2f0..8b9c882c 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 3025f092..ce81c5ff 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git 
a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 7394bdcf..6233b84a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index a095a4c7..e3a863ec 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index ea43e56e..26f813d2 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 79737728..4de1159d 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d39786bd..fded9898 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 0b373fa9..013364ff 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index fe9dccac..85e95eb3 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index b3cfd525..945dcf49 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 02aa03e3..bf1eff03 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 0e88a621fd96bc75b908d078972ab8117e957f55 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:16:03 -0700 Subject: [PATCH 40/51] UI: Better organize the right sidebar --- modules/ui_chat.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 3b922fb4..94c980bb 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -78,12 +78,19 @@ def create_ui(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) + gr.HTML("
") + shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.') + + gr.HTML("
") + shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search') with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) + gr.HTML("
") + with gr.Row(): shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') @@ -93,6 +100,8 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) + gr.HTML("
") + with gr.Row(): shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm') From 0e3def449a8bf71ab40c052e4206f612aeba0a60 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:17:25 -0700 Subject: [PATCH 41/51] llama.cpp: --swa-full to llama-server when streaming-llm is checked --- modules/llama_cpp_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 63c8eda0..58534f26 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -356,6 +356,7 @@ class LlamaServer: cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)] if shared.args.streaming_llm: cmd += ["--cache-reuse", "1"] + cmd += ["--swa-full"] if shared.args.extra_flags: # Clean up the input extra_flags = shared.args.extra_flags.strip() From c47e6deda279f27c7bff1a31351e72c0d5025052 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:20:20 -0700 Subject: [PATCH 42/51] Update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 907d8c38..6e59f7da 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. - 100% offline and private, with zero telemetry, external resources, or remote update requests. - **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents. +- **Vision (multimodal models)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal%E2Tutorial)). - **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation. - Aesthetic UI with dark and light themes. - Syntax highlighting for code blocks and LaTeX rendering for mathematical expressions. 
From e6447cd24acbde845dbb4aa27acfd4c17b5c849c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 17:42:35 -0700 Subject: [PATCH 43/51] mtmd: Update the llama-server request --- modules/llama_cpp_server.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 58534f26..e82edb90 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -149,11 +149,10 @@ class LlamaServer: IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image base64_images = [convert_pil_to_base64(img) for img in pil_images] - multimodal_prompt_object = { - "prompt": prompt, + payload["prompt"] = { + "prompt_string": prompt, "multimodal_data": base64_images } - payload["prompt"] = multimodal_prompt_object # Calculate an estimated token count text_tokens = self.encode(prompt, add_bos_token=state["add_bos_token"]) From d8fcc71616307a8ecacea93b7bdfa1117a23e1fe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 18:02:33 -0700 Subject: [PATCH 44/51] mtmd: Fail early if images are provided but the model doesn't support them (llama.cpp) --- modules/llama_cpp_server.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index e82edb90..51dacb84 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -34,6 +34,7 @@ class LlamaServer: self.process = None self.session = requests.Session() self.vocabulary_size = None + self.has_multimodal = False self.bos_token = "<s>" self.last_prompt_token_count = 0 @@ -144,6 +145,10 @@ class LlamaServer: elif 'raw_images' in state and state['raw_images']: pil_images.extend(state.get('raw_images', [])) + # Fail early if images are provided but the model doesn't support them + if pil_images and not self.has_multimodal: + raise RuntimeError("The loaded llama.cpp model does not support multimodal requests. 
You must load a vision model and provide an mmproj file.") + if pil_images: # Multimodal case IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image @@ -261,8 +266,8 @@ class LlamaServer: else: raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") - def _get_vocabulary_size(self): - """Get and store the model's maximum context length.""" + def _get_model_properties(self): + """Get and store the model's properties, including vocab size and multimodal capability.""" url = f"http://127.0.0.1:{self.port}/v1/models" response = self.session.get(url).json() @@ -271,6 +276,10 @@ class LlamaServer: if "meta" in model_info and "n_vocab" in model_info["meta"]: self.vocabulary_size = model_info["meta"]["n_vocab"] + # Check for multimodal capability + if "capabilities" in model_info and "multimodal" in model_info["capabilities"]: + self.has_multimodal = True + def _get_bos_token(self): """Get and store the model's BOS token.""" url = f"http://127.0.0.1:{self.port}/props" @@ -421,7 +430,7 @@ class LlamaServer: time.sleep(1) # Server is now healthy, get model info - self._get_vocabulary_size() + self._get_model_properties() self._get_bos_token() return self.port From 0882970a9445badcd953f27e4e10ecf869c103a5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 12 Aug 2025 07:00:24 -0700 Subject: [PATCH 45/51] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 789539fc..eb7742b1 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index d7922478..47bcb60a 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 2a3337a3..6958ce37 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 7287497d..0890b2a5 100644 --- 
a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 48ebe381..da3010c6 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 
ccf80d06..3a9a953b 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index e819dd04..a3e176d3 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 8b9c882c..807d0a21 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index ce81c5ff..41e96574 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 6233b84a..72ba7103 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index e3a863ec..0c7f1d29 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 26f813d2..09f1c502 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 4de1159d..75296cb4 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index fded9898..ff3d7cb1 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 013364ff..97414bde 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 85e95eb3..7f543205 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 945dcf49..c1764ead 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index bf1eff03..142b67ec 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
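A note on the requirement lines above: each pairs a direct wheel URL with a PEP 508 environment marker after the semicolon, and pip installs only the lines whose marker matches the running platform. A minimal sketch of that selection step, using the third-party `packaging` library (an assumption; pip ships its own vendored marker evaluator):

```python
# Sketch: how an installer picks one wheel from marker-qualified lines.
# Requires the `packaging` library (pip install packaging).
from packaging.markers import Marker

candidates = [
    ("llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl",
     'platform_system == "Windows"'),
    ("llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl",
     'platform_system == "Linux" and platform_machine == "x86_64"'),
]

for wheel, marker in candidates:
    # Marker.evaluate() checks the expression against the current environment
    if Marker(marker).evaluate():
        print(f"selected: {wheel}")
```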
From 2238302b496a4145ee98e0eab0bf3d9f19a9c83b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 08:50:45 -0700
Subject: [PATCH 46/51] ExLlamaV3: Add speculative decoding

---
 modules/exllamav3.py | 58 ++++++++++++++++++++++++++++++++++++++++++++
 modules/loaders.py   |  4 +++
 2 files changed, 62 insertions(+)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 980230f8..7fc6c5b1 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -85,6 +85,7 @@ class Exllamav3Model:
         cache = Cache(model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs)

         load_params = {'progressbar': True}
+        split = None
         if shared.args.gpu_split:
             split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
             load_params['use_per_device'] = split
@@ -92,6 +93,45 @@ class Exllamav3Model:
         model.load(**load_params)
         tokenizer = Tokenizer.from_config(config)

+        # Initialize draft model for speculative decoding
+        draft_model = None
+        draft_cache = None
+        if shared.args.model_draft and shared.args.model_draft.lower() not in ["", "none"]:
+            logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}")
+
+            draft_path = Path(shared.args.model_draft)
+            if not draft_path.is_dir():
+                draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft)
+
+            if not draft_path.is_dir():
+                logger.warning(f"Draft model not found at {draft_path}, speculative decoding disabled.")
+            else:
+                draft_config = Config.from_directory(str(draft_path))
+
+                # Set context size for draft model with 256-multiple validation
+                if shared.args.ctx_size_draft > 0:
+                    draft_max_tokens = shared.args.ctx_size_draft
+                else:
+                    draft_max_tokens = shared.args.ctx_size
+
+                # Validate draft model context size is a multiple of 256
+                if draft_max_tokens % 256 != 0:
+                    adjusted_draft_tokens = ((draft_max_tokens // 256) + 1) * 256
+                    logger.warning(f"Draft model max_num_tokens must be a multiple of 256. Adjusting from {draft_max_tokens} to {adjusted_draft_tokens}")
+                    draft_max_tokens = adjusted_draft_tokens
+
+                draft_config.max_seq_len = draft_max_tokens
+
+                draft_model = Model.from_config(draft_config)
+                draft_cache = Cache(draft_model, max_num_tokens=draft_max_tokens, layer_type=layer_type, **cache_kwargs)
+
+                draft_load_params = {'progressbar': True}
+                if split:
+                    draft_load_params['use_per_device'] = split
+
+                draft_model.load(**draft_load_params)
+                logger.info(f"Draft model loaded successfully. Max speculative tokens: {shared.args.draft_max}")
+
         # Load vision model component (ExLlamaV3 native)
         vision_model = None
         if "vision_config" in config.config_dict:
@@ -109,6 +149,9 @@ class Exllamav3Model:
             model=model,
             cache=cache,
             tokenizer=tokenizer,
+            draft_model=draft_model,
+            draft_cache=draft_cache,
+            num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0,
         )

         result = cls()
@@ -119,6 +162,8 @@ class Exllamav3Model:
         result.config = config
         result.max_tokens = max_tokens
         result.vision_model = vision_model
+        result.draft_model = draft_model
+        result.draft_cache = draft_cache

         return result

@@ -289,6 +334,7 @@ class Exllamav3Model:
         self.generator.enqueue(job)

         response_text = ""
+
         try:
             while self.generator.num_remaining_jobs():
                 results = self.generator.iterate()
@@ -300,6 +346,7 @@ class Exllamav3Model:
                 if chunk:
                     response_text += chunk
                     yield response_text
+
         finally:
             self.generator.clear_queue()

@@ -331,6 +378,17 @@ class Exllamav3Model:
                 logger.warning(f"Error unloading vision model: {e}")
             self.vision_model = None

+        if hasattr(self, 'draft_model') and self.draft_model is not None:
+            try:
+                self.draft_model.unload()
+                del self.draft_model
+            except Exception as e:
+                logger.warning(f"Error unloading draft model: {e}")
+            self.draft_model = None
+
+        if hasattr(self, 'draft_cache') and self.draft_cache is not None:
+            self.draft_cache = None
+
         if hasattr(self, 'model') and self.model is not None:
             try:
                 self.model.unload()
diff --git a/modules/loaders.py b/modules/loaders.py
index feca9985..8b7e6cce 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -61,6 +61,10 @@ loaders_and_params = OrderedDict({
         'ctx_size',
         'cache_type',
         'gpu_split',
+        'model_draft',
+        'draft_max',
+        'ctx_size_draft',
+        'speculative_decoding_accordion',
     ],
     'ExLlamav2_HF': [
         'ctx_size',
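The context-size adjustment in the patch above is easy to miss: the draft cache size must be a multiple of 256 (presumably because the cache is allocated in fixed-size pages), so off values are rounded up rather than rejected. A standalone sketch of the same arithmetic, with a helper name of our own choosing:

```python
def round_up_to_multiple(n_tokens: int, multiple: int = 256) -> int:
    """Round a context size up to the next multiple of `multiple`,
    mirroring the adjustment applied to draft_max_tokens above."""
    if n_tokens % multiple == 0:
        return n_tokens
    return ((n_tokens // multiple) + 1) * multiple


# 5000 is not a multiple of 256, so it gets bumped to 5120
assert round_up_to_multiple(5000) == 5120
# exact multiples pass through unchanged
assert round_up_to_multiple(8192) == 8192
```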
From 2f6a629393afdb33e7fd355be10f6c72185412af Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 08:51:01 -0700
Subject: [PATCH 47/51] UI: Minor improvement after 0e88a621fd96bc75b908d078972ab8117e957f55

---
 js/main.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/js/main.js b/js/main.js
index 66a344b3..4b4b14c2 100644
--- a/js/main.js
+++ b/js/main.js
@@ -583,7 +583,7 @@ function moveToChatTab() {
   const chatControlsFirstChild = document.querySelector("#chat-controls").firstElementChild;

   const newParent = chatControlsFirstChild;
-  let newPosition = newParent.children.length - 2;
+  let newPosition = newParent.children.length - 3;

   newParent.insertBefore(grandParent, newParent.children[newPosition]);
   document.getElementById("save-character").style.display = "none";

From 8d7b88106a34102863a491a9c8848871c5118a85 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:20:16 -0700
Subject: [PATCH 48/51] Revert "mtmd: Fail early if images are provided but the model doesn't support them (llama.cpp)"

This reverts commit d8fcc71616307a8ecacea93b7bdfa1117a23e1fe.
---
 modules/llama_cpp_server.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 51dacb84..e82edb90 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -34,7 +34,6 @@ class LlamaServer:
         self.process = None
         self.session = requests.Session()
         self.vocabulary_size = None
-        self.has_multimodal = False
         self.bos_token = ""
         self.last_prompt_token_count = 0
@@ -145,10 +144,6 @@ class LlamaServer:
         elif 'raw_images' in state and state['raw_images']:
             pil_images.extend(state.get('raw_images', []))

-        # Fail early if images are provided but the model doesn't support them
-        if pil_images and not self.has_multimodal:
-            raise RuntimeError("The loaded llama.cpp model does not support multimodal requests. You must load a vision model and provide an mmproj file.")
-
         if pil_images:
             # Multimodal case
             IMAGE_TOKEN_COST_ESTIMATE = 600  # A safe, conservative estimate per image
@@ -266,8 +261,8 @@ class LlamaServer:
         else:
             raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")

-    def _get_model_properties(self):
-        """Get and store the model's properties, including vocab size and multimodal capability."""
+    def _get_vocabulary_size(self):
+        """Get and store the model's vocabulary size."""
         url = f"http://127.0.0.1:{self.port}/v1/models"
         response = self.session.get(url).json()
@@ -276,10 +271,6 @@ class LlamaServer:
         if "meta" in model_info and "n_vocab" in model_info["meta"]:
             self.vocabulary_size = model_info["meta"]["n_vocab"]

-        # Check for multimodal capability
-        if "capabilities" in model_info and "multimodal" in model_info["capabilities"]:
-            self.has_multimodal = True
-
     def _get_bos_token(self):
         """Get and store the model's BOS token."""
         url = f"http://127.0.0.1:{self.port}/props"
@@ -430,7 +421,7 @@ class LlamaServer:
             time.sleep(1)

         # Server is now healthy, get model info
-        self._get_model_properties()
+        self._get_vocabulary_size()
         self._get_bos_token()

         return self.port
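After the revert, `_get_vocabulary_size()` is once again the only probe against the server's `/v1/models` endpoint. For orientation, the same lookup can be reproduced from outside the process; a sketch under two assumptions — the port (LlamaServer picks one at runtime) and the OpenAI-style `data[0]` response shape:

```python
import requests

PORT = 8080  # assumption: LlamaServer chooses its port dynamically

# Same request _get_vocabulary_size() issues once the server turns healthy
response = requests.get(f"http://127.0.0.1:{PORT}/v1/models").json()

# Assumed response shape: {"data": [{"meta": {"n_vocab": ...}, ...}]}
model_info = response["data"][0] if "data" in response else response
meta = model_info.get("meta", {})
if "n_vocab" in meta:
    print("vocabulary size:", meta["n_vocab"])
else:
    print("n_vocab not reported by this server build")
```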
From 7301452b4183efab97de71dae27486874a3d73f6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:23:24 -0700
Subject: [PATCH 49/51] UI: Minor info message change

---
 modules/ui_model_menu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 9fa8a4f4..6972a17e 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -42,7 +42,7 @@ def create_ui():
         with gr.Row():
             with gr.Column():
                 shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
-                shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.')
+                shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
                 shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                 shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
                 shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
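The tutorial introduced in the next patch ends by linking to ready-made API examples. As a small companion, here is a sketch that sends a local file through the OpenAI-compatible endpoint as a base64 data URL; the data-URL form follows the OpenAI `image_url` convention, and whether this build accepts it in addition to plain https:// URLs is an assumption:

```python
import base64

import requests

URL = "http://127.0.0.1:5000/v1/chat/completions"

# Encode a local file as a data URL, per the OpenAI image_url convention
with open("cat.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

payload = {
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }]
}

response = requests.post(URL, json=payload)
print(response.json()["choices"][0]["message"]["content"])
```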
From 2f979ce2942efc82ad90dfc28c7407c473da5169 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:33:49 -0700
Subject: [PATCH 50/51] docs: Add a multimodal tutorial

---
 docs/Multimodal Tutorial.md | 66 +++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 docs/Multimodal Tutorial.md

diff --git a/docs/Multimodal Tutorial.md b/docs/Multimodal Tutorial.md
new file mode 100644
index 00000000..a30889f7
--- /dev/null
+++ b/docs/Multimodal Tutorial.md
@@ -0,0 +1,66 @@
+## Getting started
+
+### 1. Find a multimodal model
+
+GGUF models with vision capabilities are uploaded to Hugging Face alongside an `mmproj` file.
+
+For instance, [unsloth/gemma-3-4b-it-GGUF](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/tree/main) has this:
+
+[screenshot: print1]
+
+### 2. Download the model to `user_data/models`
+
+As an example, download
+
+https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/gemma-3-4b-it-Q4_K_S.gguf?download=true
+
+to your `text-generation-webui/user_data/models` folder.
+
+### 3. Download the associated mmproj file to `user_data/mmproj`
+
+Then download
+
+https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/mmproj-F16.gguf?download=true
+
+to your `text-generation-webui/user_data/mmproj` folder. Rename it to `mmproj-gemma-3-4b-it-F16.gguf` so the file is easy to identify later.
+
+### 4. Load the model
+
+1. Launch the web UI
+2. Navigate to the Model tab
+3. Select the GGUF model in the Model dropdown:
+
+[screenshot: print2]
+
+4. Select the mmproj file in the Multimodal (vision) menu:
+
+[screenshot: print3]
+
+5. Click "Load"
+
+### 5. Send a message with an image
+
+Select your image by clicking on the 📎 icon and send your message:
+
+[screenshot: print5]
+
+The model will reply with a detailed understanding of the image contents:
+
+[screenshot: print6]
+
+## Multimodal with ExLlamaV3
+
+Multimodal also works with the ExLlamaV3 loader (the non-HF one).
+
+No additional files are necessary; just load a multimodal EXL3 model and send an image.
+
+Examples of models that you can use:
+
+- https://huggingface.co/turboderp/gemma-3-27b-it-exl3
+- https://huggingface.co/turboderp/Mistral-Small-3.1-24B-Instruct-2503-exl3
+
+## Multimodal API examples
+
+On the page below you can find ready-to-use examples:
+
+[Multimodal/vision (llama.cpp and ExLlamaV3)](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#multimodalvision-llamacpp-and-exllamav3)

From 41b95e9ec3dada8a931abb1a1ca974529d12d177 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:37:37 -0700
Subject: [PATCH 51/51] Lint

---
 modules/exllamav2.py | 2 +-
 modules/exllamav3.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 3b3233d2..5d5c5b56 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -3,7 +3,6 @@ import traceback
 from pathlib import Path

 import torch
-
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Cache,
@@ -16,6 +15,7 @@ from exllamav2 import (
     ExLlamaV2Tokenizer
 )
 from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
+
 from modules import shared
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 7fc6c5b1..66e25693 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -16,6 +16,7 @@ from exllamav3.generator.sampler import (
     SS_TopK,
     SS_TopP
 )
+
 from modules import shared
 from modules.image_utils import (
     convert_image_attachments_to_pil,