diff --git a/README.md b/README.md
index 907d8c38..6e59f7da 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
+- **Vision (multimodal models)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
- Aesthetic UI with dark and light themes.
- Syntax highlighting for code blocks and LaTeX rendering for mathematical expressions.
diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css
index 65af5f7a..583703c0 100644
--- a/css/chat_style-messenger.css
+++ b/css/chat_style-messenger.css
@@ -99,3 +99,9 @@
.message-body p em {
color: rgb(110 110 110) !important;
}
+.editing-textarea {
+ width: max(30rem) !important;
+}
+.circle-you + .text .edit-control-button, .circle-you + .text .editing-textarea {
+ color: #000 !important;
+}
diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 9831ee8f..3e5ebe67 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -13,7 +13,7 @@
line-height: 28px !important;
}
-.dark .chat .message-body :is(p, li, q, h1, h2, h3, h4, h5, h6) {
+.dark .chat .message-body :is(p, li, q, em, h1, h2, h3, h4, h5, h6) {
color: #d1d5db !important;
}
diff --git a/css/main.css b/css/main.css
index 240a94d5..062d3eb2 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1577,6 +1577,20 @@ strong {
margin-top: 4px;
}
+.image-attachment {
+ flex-direction: column;
+ max-width: 314px;
+}
+
+.image-preview {
+ border-radius: 16px;
+ margin-bottom: 5px;
+ object-fit: cover;
+ object-position: center;
+ border: 2px solid var(--border-color-primary);
+ aspect-ratio: 1 / 1;
+}
+
button:focus {
outline: none;
}
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index ec999397..fd3309c7 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -77,6 +77,58 @@ curl http://127.0.0.1:5000/v1/chat/completions \
}'
```
+#### Multimodal/vision (llama.cpp and ExLlamaV3)
+
+##### With /v1/chat/completions (recommended!)
+
+```shell
+curl http://127.0.0.1:5000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Please describe what you see in this image."},
+ {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}}
+ ]
+ }
+ ]
+ }'
+```
+
+##### With /v1/completions
+
+```shell
+curl http://127.0.0.1:5000/v1/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": "About image <__media__> and image <__media__>, what I can say is that the first one"
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"
+ }
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/strawberry.png?raw=true"
+ }
+ }
+ ]
+ }
+ ]
+ }'
+```
+
#### SSE streaming
```shell
diff --git a/docs/Multimodal Tutorial.md b/docs/Multimodal Tutorial.md
new file mode 100644
index 00000000..a30889f7
--- /dev/null
+++ b/docs/Multimodal Tutorial.md
@@ -0,0 +1,66 @@
+## Getting started
+
+### 1. Find a multimodal model
+
+GGUF models with vision capabilities are uploaded to Hugging Face along with an `mmproj` file.
+
+For instance, [unsloth/gemma-3-4b-it-GGUF](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/tree/main) has this:
+
+
+
+### 2. Download the model to `user_data/models`
+
+As an example, download
+
+https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/gemma-3-4b-it-Q4_K_S.gguf?download=true
+
+to your `text-generation-webui/user_data/models` folder.
+
+### 3. Download the associated mmproj file to `user_data/mmproj`
+
+Then download
+
+https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/mmproj-F16.gguf?download=true
+
+to your `text-generation-webui/user_data/mmproj` folder. Name it `mmproj-gemma-3-4b-it-F16.gguf` to give it a recognizable name.
+
+### 4. Load the model
+
+1. Launch the web UI
+2. Navigate to the Model tab
+3. Select the GGUF model in the Model dropdown:
+
+
+
+4. Select the mmproj file in the Multimodal (vision) menu:
+
+
+
+5. Click "Load"
+
+### 5. Send a message with an image
+
+Select your image by clicking on the 📎 icon and send your message:
+
+
+
+The model will reply with great understanding of the image contents:
+
+
+
+## Multimodal with ExLlamaV3
+
+Multimodal also works with the ExLlamaV3 loader (the non-HF one).
+
+No additional files are necessary: just load a multimodal EXL3 model and send an image.
+
+Examples of models that you can use:
+
+- https://huggingface.co/turboderp/gemma-3-27b-it-exl3
+- https://huggingface.co/turboderp/Mistral-Small-3.1-24B-Instruct-2503-exl3
+
+## Multimodal API examples
+
+In the page below you can find some ready-to-use examples:
+
+[Multimodal/vision (llama.cpp and ExLlamaV3)](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#multimodalvision-llamacpp-and-exllamav3)
diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 5181b18b..c3037d0c 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -16,6 +16,8 @@ from modules.chat import (
load_character_memoized,
load_instruction_template_memoized
)
+from modules.image_utils import convert_openai_messages_to_images
+from modules.logging_colors import logger
from modules.presets import load_preset_memoized
from modules.text_generation import decode, encode, generate_reply
@@ -82,6 +84,33 @@ def process_parameters(body, is_legacy=False):
return generate_params
+def process_multimodal_content(content):
+ """Flatten an OpenAI-style message ``content`` value into one prompt string.
+
+ - A ``str`` is returned unchanged.
+ - A ``list`` is scanned for dict items: each ``text`` item contributes its
+ ``text`` value (the values are joined with single spaces) and each
+ ``image_url`` item contributes one ``<__media__>`` placeholder. When at
+ least one placeholder was collected, the result is the placeholders,
+ a blank line, then the joined text; otherwise just the joined text.
+ Non-dict items and items of any other type are ignored.
+ - Any other value is converted with ``str()``.
+
+ Only placeholders are produced here; the image URL itself is never read.
+ """
+ if isinstance(content, str):
+ return content
+
+ if isinstance(content, list):
+ text_parts = []
+ # Placeholders are concatenated with no separator, one per image item.
+ image_placeholders = ""
+ for item in content:
+ if not isinstance(item, dict):
+ continue
+
+ item_type = item.get('type', '')
+ if item_type == 'text':
+ text_parts.append(item.get('text', ''))
+ elif item_type == 'image_url':
+ image_placeholders += "<__media__>"
+
+ final_text = ' '.join(text_parts)
+ if image_placeholders:
+ # All placeholders go before the text, regardless of where the image items appeared in the list.
+ return f"{image_placeholders}\n\n{final_text}"
+ else:
+ return final_text
+
+ # Neither str nor list: fall back to the value's string form.
+ return str(content)
+
+
def convert_history(history):
'''
Chat histories in this program are in the format [message, reply].
@@ -99,8 +128,11 @@ def convert_history(history):
role = entry["role"]
if role == "user":
+ # Extract text content (images handled by model-specific code)
+ content = process_multimodal_content(content)
user_input = content
user_input_last = True
+
if current_message:
chat_dialogue.append([current_message, '', ''])
current_message = ""
@@ -126,7 +158,11 @@ def convert_history(history):
if not user_input_last:
user_input = ""
- return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue)}
+ return user_input, system_message, {
+ 'internal': chat_dialogue,
+ 'visible': copy.deepcopy(chat_dialogue),
+ 'messages': history # Store original messages for multimodal models
+ }
def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict:
@@ -150,9 +186,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
elif m['role'] == 'function':
raise InvalidRequestError(message="role: function is not supported.", param='messages')
- if 'content' not in m and "image_url" not in m:
+ # Handle multimodal content validation
+ content = m.get('content')
+ if content is None:
raise InvalidRequestError(message="messages: missing content", param='messages')
+ # Validate multimodal content structure
+ if isinstance(content, list):
+ for item in content:
+ if not isinstance(item, dict) or 'type' not in item:
+ raise InvalidRequestError(message="messages: invalid content item format", param='messages')
+ if item['type'] not in ['text', 'image_url']:
+ raise InvalidRequestError(message="messages: unsupported content type", param='messages')
+ if item['type'] == 'text' and 'text' not in item:
+ raise InvalidRequestError(message="messages: missing text in content item", param='messages')
+ if item['type'] == 'image_url' and ('image_url' not in item or 'url' not in item['image_url']):
+ raise InvalidRequestError(message="messages: missing image_url in content item", param='messages')
+
# Chat Completions
object_type = 'chat.completion' if not stream else 'chat.completion.chunk'
created_time = int(time.time())
@@ -336,9 +386,26 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
prompt_str = 'context' if is_legacy else 'prompt'
- # ... encoded as a string, array of strings, array of tokens, or array of token arrays.
- if prompt_str not in body:
- raise InvalidRequestError("Missing required input", param=prompt_str)
+ # Handle both prompt and messages format for unified multimodal support
+ if prompt_str not in body or body[prompt_str] is None:
+ if 'messages' in body:
+ # Convert messages format to prompt for completions endpoint
+ prompt_text = ""
+ for message in body.get('messages', []):
+ if isinstance(message, dict) and 'content' in message:
+ # Extract text content from multimodal messages
+ content = message['content']
+ if isinstance(content, str):
+ prompt_text += content
+ elif isinstance(content, list):
+ for item in content:
+ if isinstance(item, dict) and item.get('type') == 'text':
+ prompt_text += item.get('text', '')
+
+ # Allow empty prompts for image-only requests
+ body[prompt_str] = prompt_text
+ else:
+ raise InvalidRequestError("Missing required input", param=prompt_str)
# common params
generate_params = process_parameters(body, is_legacy=is_legacy)
@@ -349,9 +416,22 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
suffix = body['suffix'] if body['suffix'] else ''
echo = body['echo']
+ # Add messages to generate_params if present for multimodal processing
+ if body.get('messages'):
+ generate_params['messages'] = body['messages']
+ raw_images = convert_openai_messages_to_images(generate_params['messages'])
+ if raw_images:
+ logger.info(f"Found {len(raw_images)} image(s) in request.")
+ generate_params['raw_images'] = raw_images
+
if not stream:
prompt_arg = body[prompt_str]
- if isinstance(prompt_arg, str) or (isinstance(prompt_arg, list) and isinstance(prompt_arg[0], int)):
+
+ # Handle empty/None prompts (e.g., image-only requests)
+ if prompt_arg is None:
+ prompt_arg = ""
+
+ if isinstance(prompt_arg, str) or (isinstance(prompt_arg, list) and len(prompt_arg) > 0 and isinstance(prompt_arg[0], int)):
prompt_arg = [prompt_arg]
resp_list_data = []
@@ -359,7 +439,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
total_prompt_token_count = 0
for idx, prompt in enumerate(prompt_arg, start=0):
- if isinstance(prompt[0], int):
+ if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], int):
# token lists
if requested_model == shared.model_name:
prompt = decode(prompt)[0]
@@ -448,7 +528,6 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
-
answer = ''
seen_content = ''
completion_token_count = 0
diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 6bd3749f..56d91582 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -2,7 +2,7 @@ import json
import time
from typing import Dict, List, Optional
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, model_validator, validator
class GenerationOptions(BaseModel):
@@ -99,13 +99,14 @@ class ToolCall(BaseModel):
class CompletionRequestParams(BaseModel):
model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
- prompt: str | List[str]
+ prompt: str | List[str] | None = Field(default=None, description="Text prompt for completion. Can also use 'messages' format for multimodal.")
+ messages: List[dict] | None = Field(default=None, description="OpenAI messages format for multimodal support. Alternative to 'prompt'.")
best_of: int | None = Field(default=1, description="Unused parameter.")
echo: bool | None = False
frequency_penalty: float | None = 0
logit_bias: dict | None = None
logprobs: int | None = None
- max_tokens: int | None = 16
+ max_tokens: int | None = 512
n: int | None = Field(default=1, description="Unused parameter.")
presence_penalty: float | None = 0
stop: str | List[str] | None = None
@@ -115,6 +116,12 @@ class CompletionRequestParams(BaseModel):
top_p: float | None = 1
user: str | None = Field(default=None, description="Unused parameter.")
+ @model_validator(mode='after')
+ def validate_prompt_or_messages(self):
+ """Reject a request that supplies neither ``prompt`` nor ``messages``.
+
+ Raises ``ValueError`` when both fields are None; otherwise returns the
+ model instance unchanged.
+ """
+ # NOTE(review): only None is rejected; an empty ``messages`` list or an empty ``prompt`` string passes this check.
+ if self.prompt is None and self.messages is None:
+ raise ValueError("Either 'prompt' or 'messages' must be provided")
+ return self
+
class CompletionRequest(GenerationOptions, CompletionRequestParams):
pass
@@ -220,7 +227,7 @@ class LogitsRequestParams(BaseModel):
use_samplers: bool = False
top_logits: int | None = 50
frequency_penalty: float | None = 0
- max_tokens: int | None = 16
+ max_tokens: int | None = 512
presence_penalty: float | None = 0
temperature: float | None = 1
top_p: float | None = 1
diff --git a/js/main.js b/js/main.js
index e0f9314d..4b4b14c2 100644
--- a/js/main.js
+++ b/js/main.js
@@ -583,7 +583,7 @@ function moveToChatTab() {
const chatControlsFirstChild = document.querySelector("#chat-controls").firstElementChild;
const newParent = chatControlsFirstChild;
- let newPosition = newParent.children.length - 2;
+ let newPosition = newParent.children.length - 3;
newParent.insertBefore(grandParent, newParent.children[newPosition]);
document.getElementById("save-character").style.display = "none";
@@ -977,7 +977,7 @@ if (document.readyState === "loading") {
//------------------------------------------------
// File upload button
-document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, and DOCX documents";
+document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, DOCX documents, and images";
// Activate web search
document.getElementById("web-search").title = "Search the internet with DuckDuckGo";
diff --git a/modules/chat.py b/modules/chat.py
index 1ab91b5e..7b1629dd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -271,16 +271,27 @@ def generate_chat_prompt(user_input, state, **kwargs):
# Add attachment content if present AND if past attachments are enabled
if (state.get('include_past_attachments', True) and user_key in metadata and "attachments" in metadata[user_key]):
attachments_text = ""
- for attachment in metadata[user_key]["attachments"]:
- filename = attachment.get("name", "file")
- content = attachment.get("content", "")
- if attachment.get("type") == "text/html" and attachment.get("url"):
- attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
- else:
- attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+ image_refs = ""
- if attachments_text:
- enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}"
+ for attachment in metadata[user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ # Add image reference for multimodal models
+ image_refs += "<__media__>"
+ else:
+ # Handle text/PDF attachments
+ filename = attachment.get("name", "file")
+ content = attachment.get("content", "")
+ if attachment.get("type") == "text/html" and attachment.get("url"):
+ attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
+ else:
+ attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+
+ if image_refs or attachments_text:
+ enhanced_user_msg = user_msg
+ if image_refs:
+ enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}"
+ if attachments_text:
+ enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})
@@ -301,16 +312,25 @@ def generate_chat_prompt(user_input, state, **kwargs):
if user_key in metadata and "attachments" in metadata[user_key]:
attachments_text = ""
- for attachment in metadata[user_key]["attachments"]:
- filename = attachment.get("name", "file")
- content = attachment.get("content", "")
- if attachment.get("type") == "text/html" and attachment.get("url"):
- attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
- else:
- attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+ image_refs = ""
- if attachments_text:
- user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}"
+ for attachment in metadata[user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ image_refs += "<__media__>"
+ else:
+ filename = attachment.get("name", "file")
+ content = attachment.get("content", "")
+ if attachment.get("type") == "text/html" and attachment.get("url"):
+ attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
+ else:
+ attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+
+ if image_refs or attachments_text:
+ user_input = user_input
+ if image_refs:
+ user_input = f"{image_refs}\n\n{user_input}"
+ if attachments_text:
+ user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
messages.append({"role": "user", "content": user_input})
@@ -594,29 +614,63 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
file_extension = path.suffix.lower()
try:
- # Handle different file types
- if file_extension == '.pdf':
+ # Handle image files
+ if file_extension in ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif']:
+ # Convert image to base64
+ with open(path, 'rb') as f:
+ image_data = base64.b64encode(f.read()).decode('utf-8')
+
+ # Determine MIME type from extension
+ mime_type_map = {
+ '.jpg': 'image/jpeg',
+ '.jpeg': 'image/jpeg',
+ '.png': 'image/png',
+ '.webp': 'image/webp',
+ '.bmp': 'image/bmp',
+ '.gif': 'image/gif'
+ }
+ mime_type = mime_type_map.get(file_extension, 'image/jpeg')
+
+ # Format as data URL
+ data_url = f"data:{mime_type};base64,{image_data}"
+
+ # Generate unique image ID
+ image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1
+
+ attachment = {
+ "name": filename,
+ "type": "image",
+ "image_data": data_url,
+ "image_id": image_id,
+ }
+ elif file_extension == '.pdf':
# Process PDF file
content = extract_pdf_text(path)
- file_type = "application/pdf"
+ attachment = {
+ "name": filename,
+ "type": "application/pdf",
+ "content": content,
+ }
elif file_extension == '.docx':
content = extract_docx_text(path)
- file_type = "application/docx"
+ attachment = {
+ "name": filename,
+ "type": "application/docx",
+ "content": content,
+ }
else:
# Default handling for text files
with open(path, 'r', encoding='utf-8') as f:
content = f.read()
- file_type = "text/plain"
- # Add attachment
- attachment = {
- "name": filename,
- "type": file_type,
- "content": content,
- }
+ attachment = {
+ "name": filename,
+ "type": "text/plain",
+ "content": content,
+ }
history['metadata'][key]["attachments"].append(attachment)
- return content # Return the content for reuse
+ return attachment # Return the attachment for reuse
except Exception as e:
logger.error(f"Error processing attachment {filename}: {e}")
return None
@@ -814,6 +868,22 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
'metadata': output['metadata']
}
+ row_idx = len(output['internal']) - 1
+
+ # Collect image attachments for multimodal generation from the entire history
+ all_image_attachments = []
+ if 'metadata' in output:
+ for i in range(len(output['internal'])):
+ user_key = f"user_{i}"
+ if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
+ for attachment in output['metadata'][user_key]["attachments"]:
+ if attachment.get("type") == "image":
+ all_image_attachments.append(attachment)
+
+ # Add all collected image attachments to state for the generation
+ if all_image_attachments:
+ state['image_attachments'] = all_image_attachments
+
# Generate the prompt
kwargs = {
'_continue': _continue,
@@ -828,7 +898,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
prompt = generate_chat_prompt(text, state, **kwargs)
# Add timestamp for assistant's response at the start of generation
- row_idx = len(output['internal']) - 1
update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp(), model_name=shared.model_name)
# Generate
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 6bb422ea..5d5c5b56 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -135,7 +135,8 @@ class Exllamav2Model:
return result, result
def encode(self, string, **kwargs):
- return self.tokenizer.encode(string, add_bos=True, encode_special_tokens=True)
+ # Callers may override add_bos through kwargs (default True); it is popped so it is not passed twice below.
+ # Any remaining kwargs are forwarded to the tokenizer unchanged.
+ add_bos = kwargs.pop('add_bos', True)
+ return self.tokenizer.encode(string, add_bos=add_bos, encode_special_tokens=True, **kwargs)
def decode(self, ids, **kwargs):
if isinstance(ids, list):
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
new file mode 100644
index 00000000..66e25693
--- /dev/null
+++ b/modules/exllamav3.py
@@ -0,0 +1,409 @@
+import traceback
+from pathlib import Path
+from typing import Any, List, Tuple
+
+from exllamav3 import Cache, Config, Generator, Model, Tokenizer
+from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
+from exllamav3.generator import Job
+from exllamav3.generator.sampler import (
+ CustomSampler,
+ SS_Argmax,
+ SS_MinP,
+ SS_PresFreqP,
+ SS_RepP,
+ SS_Sample,
+ SS_Temperature,
+ SS_TopK,
+ SS_TopP
+)
+
+from modules import shared
+from modules.image_utils import (
+ convert_image_attachments_to_pil,
+ convert_openai_messages_to_images
+)
+from modules.logging_colors import logger
+from modules.text_generation import get_max_prompt_length
+
+try:
+ import flash_attn
+except Exception:
+ logger.warning('Failed to load flash-attention due to the following error:\n')
+ traceback.print_exc()
+
+
+class Exllamav3Model:
+ def __init__(self):
+ # Intentionally empty: no attributes are set here. from_pretrained() assigns
+ # model, cache, tokenizer, generator, config, max_tokens, vision_model,
+ # draft_model and draft_cache on the instance it creates.
+ pass
+
+ @classmethod
+ def from_pretrained(cls, path_to_model):
+ """Load the model at ``path_to_model`` and return a configured instance.
+
+ ``path_to_model`` is resolved relative to ``shared.args.model_dir``.
+ Steps, in order:
+
+ 1. Reset the exllamav3 multimodal token allocator index.
+ 2. Build the model config, the model object and its cache. The cache size
+ is ``shared.args.ctx_size`` rounded up to a multiple of 256; the cache
+ layer type comes from ``shared.args.cache_type``.
+ 3. Load the model weights and create the tokenizer.
+ 4. Optionally load a draft model (``shared.args.model_draft``) for
+ speculative decoding.
+ 5. Optionally load the vision component when the config dict contains
+ ``vision_config``.
+ 6. Create the ``Generator``.
+
+ Returns:
+ A new instance with ``model``, ``cache``, ``tokenizer``, ``generator``,
+ ``config``, ``max_tokens``, ``vision_model``, ``draft_model`` and
+ ``draft_cache`` set. The last three are None when the corresponding
+ component was not loaded.
+ """
+ path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
+
+ # Reset global MMTokenAllocator to prevent token ID corruption when switching models
+ from exllamav3.tokenizer.mm_embedding import (
+ FIRST_MM_EMBEDDING_INDEX,
+ global_allocator
+ )
+ global_allocator.next_token_index = FIRST_MM_EMBEDDING_INDEX
+
+ config = Config.from_directory(str(path_to_model))
+ model = Model.from_config(config)
+
+ # Calculate the closest multiple of 256 at or above the chosen value
+ max_tokens = shared.args.ctx_size
+ if max_tokens % 256 != 0:
+ adjusted_tokens = ((max_tokens // 256) + 1) * 256
+ logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}")
+ max_tokens = adjusted_tokens
+
+ # Parse cache type (ExLlamaV2 pattern)
+ # Accepted forms: "fp16", "q<bits>" (same bits for k and v) or "q<k>_q<v>".
+ # NOTE(review): the two-way unpack and the int() calls below raise ValueError for
+ # strings such as "q4_q8_q2" or "qx"; unlike the other invalid cases, those do
+ # not fall back to fp16.
+ cache_type = shared.args.cache_type.lower()
+ cache_kwargs = {}
+ if cache_type == 'fp16':
+ layer_type = CacheLayer_fp16
+ elif cache_type.startswith('q'):
+ layer_type = CacheLayer_quant
+ if '_' in cache_type:
+ # Different bits for k and v (e.g., q4_q8)
+ k_part, v_part = cache_type.split('_')
+ k_bits = int(k_part[1:])
+ v_bits = int(v_part[1:])
+ else:
+ # Same bits for k and v (e.g., q4)
+ k_bits = v_bits = int(cache_type[1:])
+
+ # Validate bit ranges
+ if not (2 <= k_bits <= 8 and 2 <= v_bits <= 8):
+ logger.warning(f"Invalid quantization bits: k_bits={k_bits}, v_bits={v_bits}. Must be between 2 and 8. Falling back to fp16.")
+ layer_type = CacheLayer_fp16
+ else:
+ cache_kwargs = {'k_bits': k_bits, 'v_bits': v_bits}
+ else:
+ logger.warning(f"Unrecognized cache type: {cache_type}. Falling back to fp16.")
+ layer_type = CacheLayer_fp16
+
+ # The cache object is created before model.load() is called.
+ cache = Cache(model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs)
+
+ load_params = {'progressbar': True}
+ split = None
+ if shared.args.gpu_split:
+ # gpu_split is a comma-separated string; each entry is parsed as a float.
+ split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
+ load_params['use_per_device'] = split
+
+ model.load(**load_params)
+ tokenizer = Tokenizer.from_config(config)
+
+ # Initialize draft model for speculative decoding
+ draft_model = None
+ draft_cache = None
+ if shared.args.model_draft and shared.args.model_draft.lower() not in ["", "none"]:
+ logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}")
+
+ # Try the value as a path first, then relative to the model directory.
+ draft_path = Path(shared.args.model_draft)
+ if not draft_path.is_dir():
+ draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft)
+
+ if not draft_path.is_dir():
+ logger.warning(f"Draft model not found at {draft_path}, speculative decoding disabled.")
+ else:
+ draft_config = Config.from_directory(str(draft_path))
+
+ # Set context size for draft model with 256-multiple validation
+ if shared.args.ctx_size_draft > 0:
+ draft_max_tokens = shared.args.ctx_size_draft
+ else:
+ draft_max_tokens = shared.args.ctx_size
+
+ # Validate draft model context size is a multiple of 256
+ if draft_max_tokens % 256 != 0:
+ adjusted_draft_tokens = ((draft_max_tokens // 256) + 1) * 256
+ logger.warning(f"Draft model max_num_tokens must be a multiple of 256. Adjusting from {draft_max_tokens} to {adjusted_draft_tokens}")
+ draft_max_tokens = adjusted_draft_tokens
+
+ draft_config.max_seq_len = draft_max_tokens
+
+ draft_model = Model.from_config(draft_config)
+ # The draft cache reuses the main model's layer type and bit settings.
+ draft_cache = Cache(draft_model, max_num_tokens=draft_max_tokens, layer_type=layer_type, **cache_kwargs)
+
+ draft_load_params = {'progressbar': True}
+ if split:
+ draft_load_params['use_per_device'] = split
+
+ draft_model.load(**draft_load_params)
+ logger.info(f"Draft model loaded successfully. Max speculative tokens: {shared.args.draft_max}")
+
+ # Load vision model component (ExLlamaV3 native)
+ # A load failure is logged as a warning and leaves vision_model as None.
+ vision_model = None
+ if "vision_config" in config.config_dict:
+ logger.info("Vision component detected in model config. Attempting to load...")
+ try:
+ vision_model = Model.from_config(config, component="vision")
+ vision_model.load(progressbar=True)
+ logger.info("Vision model loaded successfully.")
+ except Exception as e:
+ logger.warning(f"Vision model loading failed (multimodal disabled): {e}")
+ else:
+ logger.info("No vision component in model config. Skipping multimodal setup.")
+
+ # num_speculative_tokens is 0 whenever no draft model was loaded.
+ generator = Generator(
+ model=model,
+ cache=cache,
+ tokenizer=tokenizer,
+ draft_model=draft_model,
+ draft_cache=draft_cache,
+ num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0,
+ )
+
+ result = cls()
+ result.model = model
+ result.cache = cache
+ result.tokenizer = tokenizer
+ result.generator = generator
+ result.config = config
+ result.max_tokens = max_tokens
+ result.vision_model = vision_model
+ result.draft_model = draft_model
+ result.draft_cache = draft_cache
+
+ return result
+
+ def is_multimodal(self) -> bool:
+ """Return True when a vision component is loaded, i.e. ``vision_model`` exists and is not None."""
+ # hasattr() covers instances on which vision_model was never assigned (__init__ sets no attributes).
+ return hasattr(self, 'vision_model') and self.vision_model is not None
+
+ def _process_images_for_generation(self, prompt: str, state: dict) -> Tuple[str, List[Any]]:
+ """
+ Collect the images referenced by ``state``, obtain one embedding per image and insert the embeddings' text aliases into ``prompt``.
+ Returns: (processed_prompt, image_embeddings). The prompt is returned unchanged with an empty list when the model is not multimodal or no image is found; an empty list is also returned when processing raises.
+ """
+ if not self.is_multimodal():
+ return prompt, []
+
+ # Collect images from various sources using shared utilities
+ # The branches are mutually exclusive: only the first non-empty source is used; sources are not merged.
+ pil_images = []
+
+ # From webui image_attachments (preferred format)
+ if 'image_attachments' in state and state['image_attachments']:
+ pil_images.extend(convert_image_attachments_to_pil(state['image_attachments']))
+ # From OpenAI API raw_images
+ elif 'raw_images' in state and state['raw_images']:
+ pil_images.extend(state['raw_images'])
+ # From OpenAI API messages format
+ elif 'messages' in state and state['messages']:
+ pil_images.extend(convert_openai_messages_to_images(state['messages']))
+
+ if not pil_images:
+ return prompt, []
+
+ # ExLlamaV3-specific: Generate embeddings
+ try:
+ # Use pre-computed embeddings if available (proper MMEmbedding lifetime)
+ # NOTE(review): in this branch the images collected above are not embedded again, and nothing checks that their count matches state['image_embeddings'].
+ if 'image_embeddings' in state and state['image_embeddings']:
+ # Use existing embeddings - this preserves MMEmbedding lifetime
+ image_embeddings = state['image_embeddings']
+ else:
+ # Do not reset the cache/allocator index; it causes token ID conflicts during generation.
+ logger.info(f"Processing {len(pil_images)} image(s) with ExLlamaV3 vision model")
+ image_embeddings = [
+ self.vision_model.get_image_embeddings(tokenizer=self.tokenizer, image=img)
+ for img in pil_images
+ ]
+
+ # ExLlamaV3-specific: Handle prompt processing with placeholders
+ placeholders = [ie.text_alias for ie in image_embeddings]
+
+ if '<__media__>' in prompt:
+ # Web chat: Replace <__media__> placeholders
+ # Each alias replaces the next remaining placeholder, left to right.
+ # NOTE(review): if the prompt has fewer placeholders than aliases, the extra aliases are not inserted, yet the log below still reports len(placeholders).
+ for alias in placeholders:
+ prompt = prompt.replace('<__media__>', alias, 1)
+ logger.info(f"Replaced {len(placeholders)} <__media__> placeholder(s)")
+ else:
+ # API: Prepend embedding aliases
+ # Aliases are placed one per line ahead of the original prompt text.
+ combined_placeholders = "\n".join(placeholders)
+ prompt = combined_placeholders + "\n" + prompt
+ logger.info(f"Prepended {len(placeholders)} embedding(s) to prompt")
+
+ return prompt, image_embeddings
+
+ except Exception as e:
+ # Best effort: log the failure and continue without image embeddings.
+ logger.error(f"Failed to process images: {e}")
+ return prompt, []
+
+    def generate_with_streaming(self, prompt, state):
+        """
+        Generate text with streaming using native ExLlamaV3 API
+
+        Args:
+            prompt: Prompt text. It is first passed through
+                ``_process_images_for_generation``, which returns a (possibly
+                modified) prompt together with a list of image embeddings.
+            state: Generation settings dict. Keys read directly here:
+                ``temperature``, ``repetition_penalty``,
+                ``repetition_penalty_range``, ``presence_penalty``,
+                ``frequency_penalty``, ``top_k``, ``top_p``, ``min_p``,
+                ``sampler_priority`` (optional), ``temperature_last``,
+                ``add_bos_token``, ``auto_max_new_tokens``,
+                ``truncation_length``, ``max_new_tokens``, ``ban_eos_token``
+                and ``skip_special_tokens``. The dict is not modified.
+
+        Yields:
+            The cumulative response text each time a non-empty chunk arrives.
+        """
+        # Process images and modify prompt (ExLlamaV3-specific)
+        prompt, image_embeddings = self._process_images_for_generation(prompt, state)
+
+        # Greedy decoding is a special case
+        if state['temperature'] == 0:
+            sampler = CustomSampler([SS_Argmax()])
+        else:
+            # 1. Create a list of all active, unordered samplers
+            unordered_samplers = []
+
+            # Penalties
+            penalty_range = state['repetition_penalty_range']
+            if penalty_range <= 0:
+                penalty_range = int(10e7)  # Use large number for "full context"
+            rep_decay = 0  # Not a configurable parameter
+
+            # Add penalty samplers if they are active
+            if state['repetition_penalty'] != 1.0:
+                unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay))
+            if state['presence_penalty'] != 0.0 or state['frequency_penalty'] != 0.0:
+                unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], state['frequency_penalty'], penalty_range, rep_decay))
+
+            # Standard samplers
+            if state['top_k'] > 0:
+                unordered_samplers.append(SS_TopK(state['top_k']))
+            if state['top_p'] < 1.0:
+                unordered_samplers.append(SS_TopP(state['top_p']))
+            if state['min_p'] > 0.0:
+                unordered_samplers.append(SS_MinP(state['min_p']))
+
+            # Temperature (SS_NoOp is returned if temp is 1.0)
+            unordered_samplers.append(SS_Temperature(state['temperature']))
+
+            # 2. Define the mapping from class names to the priority list keys
+            class_name_to_nickname = {
+                'SS_RepP': 'repetition_penalty',
+                'SS_PresFreqP': 'presence_frequency_penalty',
+                'SS_TopK': 'top_k',
+                'SS_TopP': 'top_p',
+                'SS_MinP': 'min_p',
+                'SS_Temperature': 'temperature',
+            }
+
+            # 3. Get the priority list and handle temperature_last.
+            # Always work on a private list: reordering the object stored in
+            # `state` would leak the temperature_last change into the caller's
+            # settings and into every later call.
+            default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature']
+            raw_priority = state.get('sampler_priority') or default_priority
+            if isinstance(raw_priority, str):
+                # NOTE(review): a newline/comma-separated string is believed to be
+                # what the UI sends for this setting - confirm against callers.
+                sampler_priority = [name.strip() for name in raw_priority.replace('\n', ',').split(',') if name.strip()]
+            else:
+                sampler_priority = list(raw_priority)
+            if not sampler_priority:
+                sampler_priority = list(default_priority)
+
+            if state['temperature_last'] and 'temperature' in sampler_priority:
+                sampler_priority.append(sampler_priority.pop(sampler_priority.index('temperature')))
+
+            # 4. Sort the unordered list based on the priority list.
+            # Rank of a nickname is its first position in the priority list;
+            # samplers without a rank get -1 and therefore sort first.
+            priority_rank = {}
+            for position, nickname in enumerate(sampler_priority):
+                priority_rank.setdefault(nickname, position)
+
+            def custom_sort_key(sampler_obj):
+                nickname = class_name_to_nickname.get(sampler_obj.__class__.__name__)
+                if nickname is None:
+                    return -1
+                return priority_rank.get(nickname, -1)
+
+            ordered_samplers = sorted(unordered_samplers, key=custom_sort_key)
+
+            # 5. Add the final sampling stage and build the sampler
+            ordered_samplers.append(SS_Sample())
+            sampler = CustomSampler(ordered_samplers)
+
+        # Encode prompt with embeddings (ExLlamaV3-specific)
+        input_ids = self.tokenizer.encode(
+            prompt,
+            add_bos=state['add_bos_token'],
+            encode_special_tokens=True,
+            embeddings=image_embeddings,
+        )
+
+        # Keep only the trailing tokens that fit the prompt budget
+        input_ids = input_ids[:, -get_max_prompt_length(state):]
+
+        self._last_prompt_token_count = input_ids.shape[-1]
+
+        # Determine max_new_tokens
+        if state['auto_max_new_tokens']:
+            max_new_tokens = state['truncation_length'] - self._last_prompt_token_count
+        else:
+            max_new_tokens = state['max_new_tokens']
+
+        # Get stop conditions
+        stop_conditions = []
+        if not state['ban_eos_token']:
+            if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None:
+                stop_conditions.append(self.tokenizer.eos_token_id)
+
+        job = Job(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            decode_special_tokens=not state['skip_special_tokens'],
+            embeddings=image_embeddings if image_embeddings else None,
+            sampler=sampler,
+            stop_conditions=stop_conditions if stop_conditions else None,
+        )
+
+        # Stream generation
+        self.generator.enqueue(job)
+
+        response_text = ""
+
+        try:
+            while self.generator.num_remaining_jobs():
+                results = self.generator.iterate()
+                for result in results:
+                    # Emit the text before looking at the eos flag: the result
+                    # that carries eos may also carry the last piece of text,
+                    # and breaking first would drop it.
+                    chunk = result.get("text", "")
+                    if chunk:
+                        response_text += chunk
+                        yield response_text
+
+                    if result.get("eos"):
+                        break
+
+        finally:
+            # Runs on normal completion, on errors, and when the consumer
+            # closes the generator early.
+            self.generator.clear_queue()
+
+    def generate(self, prompt, state):
+        """
+        Run a full generation and return only the final text.
+
+        Drains ``generate_with_streaming`` and returns the last value it
+        yielded, or an empty string if it yielded nothing.
+        """
+        final_text = ""
+        for final_text in self.generate_with_streaming(prompt, state):
+            # Each yielded value supersedes the previous one.
+            pass
+
+        return final_text
+
+    def encode(self, string, **kwargs):
+        """
+        Tokenize ``string`` with the underlying tokenizer.
+
+        ``add_bos`` defaults to True when the caller does not pass it; every
+        other keyword argument is forwarded unchanged.
+        """
+        # A caller-supplied add_bos overrides the default set here.
+        options = {'add_bos': True, **kwargs}
+        return self.tokenizer.encode(string, **options)
+
+    def decode(self, ids, **kwargs):
+        """
+        Decode ``ids`` with the underlying tokenizer, forwarding all keyword
+        arguments unchanged.
+        """
+        decoded = self.tokenizer.decode(ids, **kwargs)
+        return decoded
+
+    @property
+    def last_prompt_token_count(self):
+        """
+        Number of prompt tokens recorded by the most recent call to
+        ``generate_with_streaming``, or 0 if none has been recorded yet.
+        """
+        try:
+            return self._last_prompt_token_count
+        except AttributeError:
+            # Nothing has been generated on this instance yet.
+            return 0
+
+    def unload(self):
+        """
+        Drop every model component held by this instance.
+
+        The draft model and the main model have their own ``unload()`` called
+        first. Failures while releasing the vision, draft or main model are
+        logged as warnings and do not stop the remaining components from
+        being cleared. Each component that was set ends up as ``None``.
+        """
+        logger.info("Unloading ExLlamaV3 model components...")
+
+        def is_set(attr_name):
+            # True only when the attribute exists and is not None.
+            return getattr(self, attr_name, None) is not None
+
+        def clear(attr_name):
+            # Reset an attribute to None, but only if it is currently set.
+            if is_set(attr_name):
+                setattr(self, attr_name, None)
+
+        if is_set('vision_model'):
+            try:
+                del self.vision_model
+            except Exception as e:
+                logger.warning(f"Error unloading vision model: {e}")
+            self.vision_model = None
+
+        if is_set('draft_model'):
+            try:
+                self.draft_model.unload()
+                del self.draft_model
+            except Exception as e:
+                logger.warning(f"Error unloading draft model: {e}")
+            self.draft_model = None
+
+        clear('draft_cache')
+
+        if is_set('model'):
+            try:
+                self.model.unload()
+                del self.model
+            except Exception as e:
+                logger.warning(f"Error unloading main model: {e}")
+
+            self.model = None
+
+        # Remaining components only need their references dropped.
+        for attr_name in ('cache', 'generator', 'tokenizer'):
+            clear(attr_name)
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 79237f7f..cb14a722 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -406,16 +406,26 @@ def format_message_attachments(history, role, index):
for attachment in attachments:
name = html.escape(attachment["name"])
- # Make clickable if URL exists
- if "url" in attachment:
- name = f'{name}'
+ if attachment.get("type") == "image":
+ image_data = attachment.get("image_data", "")
+ attachments_html += (
+ f'