From b391ac8eb1ba63e449f0ef021db56d6513dce646 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 17:51:24 -0700 Subject: [PATCH 01/51] Fix getting the ctx-size for EXL3/EXL2/Transformers models --- modules/models_settings.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index e35e1c04..4e53dc81 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -106,9 +106,16 @@ def get_model_metadata(model): for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']: if k in metadata: - model_settings['truncation_length'] = metadata[k] - model_settings['truncation_length_info'] = metadata[k] - model_settings['ctx_size'] = min(metadata[k], 8192) + value = metadata[k] + elif k in metadata.get('text_config', {}): + value = metadata['text_config'][k] + else: + continue + + model_settings['truncation_length'] = value + model_settings['truncation_length_info'] = value + model_settings['ctx_size'] = min(value, 8192) + break if 'rope_theta' in metadata: model_settings['rope_freq_base'] = metadata['rope_theta'] From 88127f46c124723554b5e87cad9c868348ed4c53 Mon Sep 17 00:00:00 2001 From: Katehuuh <133996730+Katehuuh@users.noreply.github.com> Date: Sat, 9 Aug 2025 04:31:16 +0200 Subject: [PATCH 02/51] Add multimodal support (ExLlamaV3) (#7174) --- css/main.css | 13 ++ docs/12 - OpenAI API.md | 18 ++ extensions/openai/completions.py | 102 +++++++++- extensions/openai/image_utils.py | 97 ++++++++++ extensions/openai/typing.py | 16 +- modules/chat.py | 126 ++++++++++--- modules/exllamav3.py | 313 +++++++++++++++++++++++++++++++ modules/html_generator.py | 29 ++- modules/loaders.py | 40 ++++ modules/models.py | 13 +- modules/shared.py | 2 + modules/text_generation.py | 10 +- modules/ui_chat.py | 2 +- 13 files changed, 726 insertions(+), 55 deletions(-) create mode 100644 extensions/openai/image_utils.py create mode 100644 modules/exllamav3.py diff --git a/css/main.css b/css/main.css index 240a94d5..de16d81d 100644 --- a/css/main.css +++ b/css/main.css @@ -1577,6 +1577,19 @@ strong { margin-top: 4px; } +.image-attachment { + flex-direction: column; +} + +.image-preview { + border-radius: 16px; + margin-bottom: 5px; + object-fit: cover; + object-position: center; + border: 2px solid var(--border-color-primary); + aspect-ratio: 1 / 1; +} + button:focus { outline: none; } diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index ec999397..b7b5fbc1 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -77,6 +77,24 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` +#### Multimodal support (ExLlamaV3) + +```shell +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What color is this image?"}, + {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}} + ] + } + ] + }' +``` + #### SSE streaming ```shell diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 5181b18b..3d389f0b 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -7,6 +7,7 @@ import tiktoken from pydantic import ValidationError from extensions.openai.errors import InvalidRequestError +from extensions.openai.image_utils import convert_openai_messages_to_images from 
extensions.openai.typing import ToolDefinition from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from modules import shared @@ -16,6 +17,7 @@ from modules.chat import ( load_character_memoized, load_instruction_template_memoized ) +from modules.logging_colors import logger from modules.presets import load_preset_memoized from modules.text_generation import decode, encode, generate_reply @@ -82,6 +84,21 @@ def process_parameters(body, is_legacy=False): return generate_params +def process_multimodal_content(content): + """Extract text from OpenAI multimodal format for non-multimodal models""" + if isinstance(content, str): + return content + + if isinstance(content, list): + text_parts = [] + for item in content: + if isinstance(item, dict) and item.get('type') == 'text': + text_parts.append(item.get('text', '')) + return ' '.join(text_parts) if text_parts else str(content) + + return str(content) + + def convert_history(history): ''' Chat histories in this program are in the format [message, reply]. @@ -99,8 +116,11 @@ def convert_history(history): role = entry["role"] if role == "user": + # Extract text content (images handled by model-specific code) + content = process_multimodal_content(content) user_input = content user_input_last = True + if current_message: chat_dialogue.append([current_message, '', '']) current_message = "" @@ -126,7 +146,11 @@ def convert_history(history): if not user_input_last: user_input = "" - return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue)} + return user_input, system_message, { + 'internal': chat_dialogue, + 'visible': copy.deepcopy(chat_dialogue), + 'messages': history # Store original messages for multimodal models + } def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict: @@ -150,9 +174,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p elif m['role'] == 'function': raise InvalidRequestError(message="role: function is not supported.", param='messages') - if 'content' not in m and "image_url" not in m: + # Handle multimodal content validation + content = m.get('content') + if content is None: raise InvalidRequestError(message="messages: missing content", param='messages') + # Validate multimodal content structure + if isinstance(content, list): + for item in content: + if not isinstance(item, dict) or 'type' not in item: + raise InvalidRequestError(message="messages: invalid content item format", param='messages') + if item['type'] not in ['text', 'image_url']: + raise InvalidRequestError(message="messages: unsupported content type", param='messages') + if item['type'] == 'text' and 'text' not in item: + raise InvalidRequestError(message="messages: missing text in content item", param='messages') + if item['type'] == 'image_url' and ('image_url' not in item or 'url' not in item['image_url']): + raise InvalidRequestError(message="messages: missing image_url in content item", param='messages') + # Chat Completions object_type = 'chat.completion' if not stream else 'chat.completion.chunk' created_time = int(time.time()) @@ -336,9 +374,26 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): prompt_str = 'context' if is_legacy else 'prompt' - # ... encoded as a string, array of strings, array of tokens, or array of token arrays. 
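+ # Only the text parts of 'messages' are recovered into the prompt string here; any image parts stay in body['messages'] and are extracted into generate_params further down for the multimodal loaders.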
- if prompt_str not in body: - raise InvalidRequestError("Missing required input", param=prompt_str) + # Handle both prompt and messages format for unified multimodal support + if prompt_str not in body or body[prompt_str] is None: + if 'messages' in body: + # Convert messages format to prompt for completions endpoint + prompt_text = "" + for message in body.get('messages', []): + if isinstance(message, dict) and 'content' in message: + # Extract text content from multimodal messages + content = message['content'] + if isinstance(content, str): + prompt_text += content + elif isinstance(content, list): + for item in content: + if isinstance(item, dict) and item.get('type') == 'text': + prompt_text += item.get('text', '') + + # Allow empty prompts for image-only requests + body[prompt_str] = prompt_text + else: + raise InvalidRequestError("Missing required input", param=prompt_str) # common params generate_params = process_parameters(body, is_legacy=is_legacy) @@ -349,9 +404,18 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): suffix = body['suffix'] if body['suffix'] else '' echo = body['echo'] + # Add messages to generate_params if present for multimodal processing + if 'messages' in body: + generate_params['messages'] = body['messages'] + if not stream: prompt_arg = body[prompt_str] - if isinstance(prompt_arg, str) or (isinstance(prompt_arg, list) and isinstance(prompt_arg[0], int)): + + # Handle empty/None prompts (e.g., image-only requests) + if prompt_arg is None: + prompt_arg = "" + + if isinstance(prompt_arg, str) or (isinstance(prompt_arg, list) and len(prompt_arg) > 0 and isinstance(prompt_arg[0], int)): prompt_arg = [prompt_arg] resp_list_data = [] @@ -374,7 +438,19 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # generate reply ####################################### debug_msg({'prompt': prompt, 'generate_params': generate_params}) - generator = generate_reply(prompt, generate_params, is_chat=False) + + # Use multimodal generation if images are present + if 'messages' in generate_params: + raw_images = convert_openai_messages_to_images(generate_params['messages']) + if raw_images: + logger.info(f"Using multimodal generation for {len(raw_images)} images") + generate_params['raw_images'] = raw_images + generator = shared.model.generate_with_streaming(prompt, generate_params) + else: + generator = generate_reply(prompt, generate_params, is_chat=False) + else: + generator = generate_reply(prompt, generate_params, is_chat=False) + answer = '' for a in generator: @@ -447,7 +523,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # generate reply ####################################### debug_msg({'prompt': prompt, 'generate_params': generate_params}) - generator = generate_reply(prompt, generate_params, is_chat=False) + # Use multimodal generation if images are present + if 'messages' in generate_params: + raw_images = convert_openai_messages_to_images(generate_params['messages']) + if raw_images: + logger.info(f"Using multimodal generation for {len(raw_images)} images") + generate_params['raw_images'] = raw_images + generator = shared.model.generate_with_streaming(prompt, generate_params) + else: + generator = generate_reply(prompt, generate_params, is_chat=False) + else: + generator = generate_reply(prompt, generate_params, is_chat=False) answer = '' seen_content = '' diff --git a/extensions/openai/image_utils.py b/extensions/openai/image_utils.py new file mode 100644 index 00000000..c54f0532 --- 
/dev/null +++ b/extensions/openai/image_utils.py @@ -0,0 +1,97 @@ +""" +Shared image processing utilities for multimodal support. +Used by both ExLlamaV3 and llama.cpp implementations. +""" +import base64 +import io +from typing import Any, List, Tuple + +from PIL import Image + +from modules.logging_colors import logger + + +def decode_base64_image(base64_string: str) -> Image.Image: + """Decodes a base64 string to a PIL Image.""" + try: + if base64_string.startswith('data:image/'): + base64_string = base64_string.split(',', 1)[1] + + image_data = base64.b64decode(base64_string) + image = Image.open(io.BytesIO(image_data)) + return image + except Exception as e: + logger.error(f"Failed to decode base64 image: {e}") + raise ValueError(f"Invalid base64 image data: {e}") + + +def process_message_content(content: Any) -> Tuple[str, List[Image.Image]]: + """ + Processes message content that may contain text and images. + Returns: A tuple of (text_content, list_of_pil_images). + """ + if isinstance(content, str): + return content, [] + + if isinstance(content, list): + text_parts = [] + images = [] + for item in content: + if not isinstance(item, dict): + continue + + item_type = item.get('type', '') + if item_type == 'text': + text_parts.append(item.get('text', '')) + elif item_type == 'image_url': + image_url_data = item.get('image_url', {}) + image_url = image_url_data.get('url', '') + + if image_url.startswith('data:image/'): + try: + images.append(decode_base64_image(image_url)) + except Exception as e: + logger.warning(f"Failed to process a base64 image: {e}") + elif image_url.startswith('http'): + # Support external URLs + try: + import requests + response = requests.get(image_url, timeout=10) + response.raise_for_status() + image_data = response.content + image = Image.open(io.BytesIO(image_data)) + images.append(image) + logger.info("Successfully loaded external image from URL") + except Exception as e: + logger.warning(f"Failed to fetch external image: {e}") + else: + logger.warning(f"Unsupported image URL format: {image_url[:70]}...") + + return ' '.join(text_parts), images + + return str(content), [] + + +def convert_image_attachments_to_pil(image_attachments: List[dict]) -> List[Image.Image]: + """Convert webui image_attachments format to PIL Images.""" + pil_images = [] + for attachment in image_attachments: + if attachment.get('type') == 'image' and 'image_data' in attachment: + try: + image = decode_base64_image(attachment['image_data']) + if image.mode != 'RGB': + image = image.convert('RGB') + pil_images.append(image) + except Exception as e: + logger.warning(f"Failed to process image attachment: {e}") + return pil_images + + +def convert_openai_messages_to_images(messages: List[dict]) -> List[Image.Image]: + """Convert OpenAI messages format to PIL Images.""" + all_images = [] + for message in messages: + if isinstance(message, dict) and 'content' in message: + _, images = process_message_content(message['content']) + all_images.extend(images) + return all_images diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 6bd3749f..e9f92da5 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -2,7 +2,7 @@ import json import time from typing import Dict, List, Optional -from pydantic import BaseModel, Field, validator +from pydantic import BaseModel, Field, field_validator, validator class GenerationOptions(BaseModel): @@ -99,7 +99,8 @@ class ToolCall(BaseModel): class CompletionRequestParams(BaseModel): model: str | None = 
Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.") - prompt: str | List[str] + prompt: str | List[str] | None = Field(default=None, description="Text prompt for completion. Can also use 'messages' format for multimodal.") + messages: List[dict] | None = Field(default=None, description="OpenAI messages format for multimodal support. Alternative to 'prompt'.") best_of: int | None = Field(default=1, description="Unused parameter.") echo: bool | None = False frequency_penalty: float | None = 0 @@ -115,6 +116,17 @@ class CompletionRequestParams(BaseModel): top_p: float | None = 1 user: str | None = Field(default=None, description="Unused parameter.") + @field_validator('prompt', 'messages') + @classmethod + def validate_prompt_or_messages(cls, v, info): + """Ensure either 'prompt' or 'messages' is provided for completions.""" + if info.field_name == 'prompt': # If we're validating 'prompt', check if neither prompt nor messages will be set + messages = info.data.get('messages') + if v is None and messages is None: + raise ValueError("Either 'prompt' or 'messages' must be provided") + + return v + class CompletionRequest(GenerationOptions, CompletionRequestParams): pass diff --git a/modules/chat.py b/modules/chat.py index 1ab91b5e..354ae46b 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -271,16 +271,27 @@ def generate_chat_prompt(user_input, state, **kwargs): # Add attachment content if present AND if past attachments are enabled if (state.get('include_past_attachments', True) and user_key in metadata and "attachments" in metadata[user_key]): attachments_text = "" - for attachment in metadata[user_key]["attachments"]: - filename = attachment.get("name", "file") - content = attachment.get("content", "") - if attachment.get("type") == "text/html" and attachment.get("url"): - attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" - else: - attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + image_refs = "" - if attachments_text: - enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}" + for attachment in metadata[user_key]["attachments"]: + if attachment.get("type") == "image": + # Add image reference for multimodal models + image_refs += "<__media__>" + else: + # Handle text/PDF attachments + filename = attachment.get("name", "file") + content = attachment.get("content", "") + if attachment.get("type") == "text/html" and attachment.get("url"): + attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" + else: + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if image_refs or attachments_text: + enhanced_user_msg = user_msg + if image_refs: + enhanced_user_msg += f" {image_refs}" + if attachments_text: + enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}" messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) @@ -301,16 +312,23 @@ def generate_chat_prompt(user_input, state, **kwargs): if user_key in metadata and "attachments" in metadata[user_key]: attachments_text = "" - for attachment in metadata[user_key]["attachments"]: - filename = attachment.get("name", "file") - content = attachment.get("content", "") - if attachment.get("type") == "text/html" and attachment.get("url"): - attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" - else: - 
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + image_refs = "" - if attachments_text: - user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}" + for attachment in metadata[user_key]["attachments"]: + if attachment.get("type") == "image": + image_refs += "<__media__>" + else: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + if attachment.get("type") == "text/html" and attachment.get("url"): + attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" + else: + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if image_refs or attachments_text: + user_input = f"{user_input} {image_refs}" + if attachments_text: + user_input += f"\n\nATTACHMENTS:\n{attachments_text}" messages.append({"role": "user", "content": user_input}) @@ -594,29 +612,64 @@ def add_message_attachment(history, row_idx, file_path, is_user=True): file_extension = path.suffix.lower() try: - # Handle different file types - if file_extension == '.pdf': + # Handle image files + if file_extension in ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif']: + # Convert image to base64 + with open(path, 'rb') as f: + image_data = base64.b64encode(f.read()).decode('utf-8') + + # Determine MIME type from extension + mime_type_map = { + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.png': 'image/png', + '.webp': 'image/webp', + '.bmp': 'image/bmp', + '.gif': 'image/gif' + } + mime_type = mime_type_map.get(file_extension, 'image/jpeg') + + # Format as data URL + data_url = f"data:{mime_type};base64,{image_data}" + + # Generate unique image ID + image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1 + + attachment = { + "name": filename, + "type": "image", + "image_data": data_url, + "image_id": image_id, + "file_path": str(path) # For UI preview + } + elif file_extension == '.pdf': # Process PDF file content = extract_pdf_text(path) - file_type = "application/pdf" + attachment = { + "name": filename, + "type": "application/pdf", + "content": content, + } elif file_extension == '.docx': content = extract_docx_text(path) - file_type = "application/docx" + attachment = { + "name": filename, + "type": "application/docx", + "content": content, + } else: # Default handling for text files with open(path, 'r', encoding='utf-8') as f: content = f.read() - file_type = "text/plain" - # Add attachment - attachment = { - "name": filename, - "type": file_type, - "content": content, - } + attachment = { + "name": filename, + "type": "text/plain", + "content": content, + } history['metadata'][key]["attachments"].append(attachment) - return content # Return the content for reuse + return attachment # Return the attachment for reuse except Exception as e: logger.error(f"Error processing attachment {filename}: {e}") return None @@ -759,6 +812,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for file_path in files: add_message_attachment(output, row_idx, file_path, is_user=True) + # Collect image attachments for ExLlamaV3 + image_attachments = [] + if 'metadata' in output: + user_key = f"user_{row_idx}" + if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: + for attachment in output['metadata'][user_key]["attachments"]: + if attachment.get("type") == "image": + image_attachments.append(attachment) + + # Add image attachments to state for the generation + if 
image_attachments: + state['image_attachments'] = image_attachments + # Add web search results as attachments if enabled if state.get('enable_web_search', False): search_query = generate_search_query(text, state) diff --git a/modules/exllamav3.py b/modules/exllamav3.py new file mode 100644 index 00000000..c2532ec3 --- /dev/null +++ b/modules/exllamav3.py @@ -0,0 +1,313 @@ +import traceback +from pathlib import Path +from typing import Any, List, Tuple + +from exllamav3 import Cache, Config, Generator, Model, Tokenizer +from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant + +from extensions.openai.image_utils import ( + convert_image_attachments_to_pil, + convert_openai_messages_to_images +) +from modules import shared +from modules.logging_colors import logger + +try: + import flash_attn +except Exception: + logger.warning('Failed to load flash-attention due to the following error:\n') + traceback.print_exc() + + +class Exllamav3Model: + def __init__(self): + pass + + @classmethod + def from_pretrained(cls, path_to_model): + path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model) + + # Reset global MMTokenAllocator to prevent token ID corruption when switching models + from exllamav3.tokenizer.mm_embedding import ( + FIRST_MM_EMBEDDING_INDEX, + global_allocator + ) + global_allocator.next_token_index = FIRST_MM_EMBEDDING_INDEX + logger.info("Reset MMTokenAllocator for clean multimodal token allocation") + + config = Config.from_directory(str(path_to_model)) + model = Model.from_config(config) + + # Calculate the closest multiple of 256 at or above the chosen value + max_tokens = shared.args.ctx_size + if max_tokens % 256 != 0: + adjusted_tokens = ((max_tokens // 256) + 1) * 256 + logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}") + max_tokens = adjusted_tokens + + # Parse cache type (ExLlamaV2 pattern) + cache_type = shared.args.cache_type.lower() + cache_kwargs = {} + if cache_type == 'fp16': + layer_type = CacheLayer_fp16 + elif cache_type.startswith('q'): + layer_type = CacheLayer_quant + if '_' in cache_type: + # Different bits for k and v (e.g., q4_q8) + k_part, v_part = cache_type.split('_') + k_bits = int(k_part[1:]) + v_bits = int(v_part[1:]) + else: + # Same bits for k and v (e.g., q4) + k_bits = v_bits = int(cache_type[1:]) + + # Validate bit ranges + if not (2 <= k_bits <= 8 and 2 <= v_bits <= 8): + logger.warning(f"Invalid quantization bits: k_bits={k_bits}, v_bits={v_bits}. Must be between 2 and 8. Falling back to fp16.") + layer_type = CacheLayer_fp16 + else: + cache_kwargs = {'k_bits': k_bits, 'v_bits': v_bits} + else: + logger.warning(f"Unrecognized cache type: {cache_type}. 
Falling back to fp16.") + layer_type = CacheLayer_fp16 + + cache = Cache(model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs) + + load_params = {'progressbar': True} + if shared.args.gpu_split: + split = [float(alloc) for alloc in shared.args.gpu_split.split(",")] + load_params['use_per_device'] = split + + model.load(**load_params) + + tokenizer = Tokenizer.from_config(config) + + # Load vision model component (ExLlamaV3 native) + vision_model = None + try: + logger.info("Loading vision model component...") + vision_model = Model.from_config(config, component="vision") + vision_model.load(progressbar=True) + logger.info("Vision model loaded successfully") + except Exception as e: + logger.warning(f"Vision model loading failed (multimodal disabled): {e}") + + generator = Generator( + model=model, + cache=cache, + tokenizer=tokenizer, + ) + + result = cls() + result.model = model + result.cache = cache + result.tokenizer = tokenizer + result.generator = generator + result.config = config + result.max_tokens = max_tokens + result.vision_model = vision_model + + return result + + def is_multimodal(self) -> bool: + """Check if this model supports multimodal input.""" + return hasattr(self, 'vision_model') and self.vision_model is not None + + def _process_images_for_generation(self, prompt: str, state: dict) -> Tuple[str, List[Any]]: + """ + Process all possible image inputs and return modified prompt + embeddings. + Returns: (processed_prompt, image_embeddings) + """ + if not self.is_multimodal(): + return prompt, [] + + # Collect images from various sources using shared utilities + pil_images = [] + + # From webui image_attachments (preferred format) + if 'image_attachments' in state and state['image_attachments']: + pil_images.extend(convert_image_attachments_to_pil(state['image_attachments'])) + + # From OpenAI API raw_images + elif 'raw_images' in state and state['raw_images']: + pil_images.extend(state['raw_images']) + + # From OpenAI API messages format + elif 'messages' in state and state['messages']: + pil_images.extend(convert_openai_messages_to_images(state['messages'])) + + if not pil_images: + return prompt, [] + + # ExLlamaV3-specific: Generate embeddings + try: + # Use pre-computed embeddings if available (proper MMEmbedding lifetime) + if 'image_embeddings' in state and state['image_embeddings']: + # Use existing embeddings - this preserves MMEmbedding lifetime + image_embeddings = state['image_embeddings'] + else: + # Do not reset the cache/allocator index; it causes token ID conflicts during generation. 
+ + logger.info(f"Processing {len(pil_images)} image(s) with ExLlamaV3 vision model") + image_embeddings = [ + self.vision_model.get_image_embeddings(tokenizer=self.tokenizer, image=img) + for img in pil_images + ] + + # ExLlamaV3-specific: Handle prompt processing with placeholders + placeholders = [ie.text_alias for ie in image_embeddings] + + if '<__media__>' in prompt: + # Web chat: Replace <__media__> placeholders + for alias in placeholders: + prompt = prompt.replace('<__media__>', alias, 1) + logger.info(f"Replaced {len(placeholders)} <__media__> placeholder(s)") + else: + # API: Prepend embedding aliases + combined_placeholders = "\n".join(placeholders) + prompt = combined_placeholders + "\n" + prompt + logger.info(f"Prepended {len(placeholders)} embedding(s) to prompt") + + return prompt, image_embeddings + + except Exception as e: + logger.error(f"Failed to process images: {e}") + return prompt, [] + + def generate_with_streaming(self, prompt, state): + """ + Generate text with streaming using native ExLlamaV3 API + """ + from exllamav3 import Job + from exllamav3.generator.sampler.presets import ComboSampler + + # Process images and modify prompt (ExLlamaV3-specific) + prompt, image_embeddings = self._process_images_for_generation(prompt, state) + + sampler = ComboSampler( + rep_p=state.get('repetition_penalty', 1.0), + freq_p=state.get('frequency_penalty', 0.0), + pres_p=state.get('presence_penalty', 0.0), + temperature=state.get('temperature', 0.7), + min_p=state.get('min_p', 0.0), + top_k=state.get('top_k', 0), + top_p=state.get('top_p', 1.0), + ) + + # Encode prompt with embeddings (ExLlamaV3-specific) + if image_embeddings: + input_ids = self.tokenizer.encode( + prompt, + encode_special_tokens=True, + embeddings=image_embeddings, + ) + else: + input_ids = self.tokenizer.encode(prompt, encode_special_tokens=True) + + # Get stop conditions from state (webui format) - keep as strings like ExLlamaV3 examples + stop_conditions = [] + if 'stopping_strings' in state and state['stopping_strings']: + # Use strings directly (ExLlamaV3 handles the conversion internally) + stop_conditions.extend(state['stopping_strings']) + + # Add EOS token ID as ExLlamaV3 examples do + if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + stop_conditions.append(self.tokenizer.eos_token_id) + + job = Job( + input_ids=input_ids, + max_new_tokens=state.get('max_new_tokens', 500), + decode_special_tokens=True, + embeddings=image_embeddings if image_embeddings else None, + sampler=sampler, + stop_conditions=stop_conditions if stop_conditions else None, + ) + + # Stream generation + self.generator.enqueue(job) + + response_text = "" + try: + while self.generator.num_remaining_jobs(): + results = self.generator.iterate() + for result in results: + if "eos" in result and result["eos"]: + break + + chunk = result.get("text", "") + if chunk: + response_text += chunk + yield response_text + finally: + # No cleanup needed. MMEmbedding lifetime is managed by Python. + # Cache and page table resets are unnecessary and can cause token ID conflicts. 
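+ # (The Job keeps references to any image embeddings for its whole lifetime, so they must stay alive until streaming ends.)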
+ pass + + def generate(self, prompt, state): + """ + Generate text using native ExLlamaV3 API (non-streaming) + """ + output = self.generator.generate( + prompt=prompt, + max_new_tokens=state.get('max_new_tokens', 500), + temperature=state.get('temperature', 0.7), + top_p=state.get('top_p', 1.0), + top_k=state.get('top_k', 0), + repetition_penalty=state.get('repetition_penalty', 1.0), + frequency_penalty=state.get('frequency_penalty', 0.0), + presence_penalty=state.get('presence_penalty', 0.0), + min_p=state.get('min_p', 0.0), + ) + + return output + + def encode(self, string, **kwargs): + return self.tokenizer.encode(string, **kwargs) + + def decode(self, ids, **kwargs): + return self.tokenizer.decode(ids, **kwargs) + + @property + def last_prompt_token_count(self): + # This would need to be tracked during generation + return 0 + + def unload(self): + logger.info("Unloading ExLlamaV3 model components...") + + if hasattr(self, 'vision_model') and self.vision_model is not None: + try: + del self.vision_model + except Exception as e: + logger.warning(f"Error unloading vision model: {e}") + self.vision_model = None + + if hasattr(self, 'model') and self.model is not None: + try: + self.model.unload() + del self.model + except Exception as e: + logger.warning(f"Error unloading main model: {e}") + self.model = None + + if hasattr(self, 'cache') and self.cache is not None: + self.cache = None + + if hasattr(self, 'generator') and self.generator is not None: + self.generator = None + + if hasattr(self, 'tokenizer') and self.tokenizer is not None: + self.tokenizer = None + + # Force GPU memory cleanup + import gc + + import torch + gc.collect() + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.cuda.empty_cache() + + logger.info("ExLlamaV3 model fully unloaded") diff --git a/modules/html_generator.py b/modules/html_generator.py index 79237f7f..63a0cdd0 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -406,16 +406,27 @@ def format_message_attachments(history, role, index): for attachment in attachments: name = html.escape(attachment["name"]) - # Make clickable if URL exists - if "url" in attachment: - name = f'{name}' + if attachment.get("type") == "image": + # Show image preview + file_path = attachment.get("file_path", "") + attachments_html += ( + f'
<div class="attachment-box image-attachment">' + f'<img src="file/{file_path}" alt="{name}" class="image-preview">' + f'<div class="attachment-name">{name}</div>' + f'</div>' + ) + else: + # Make clickable if URL exists (web search) + if "url" in attachment: + name = f'<a href="{attachment["url"]}" target="_blank" rel="noopener noreferrer">{name}</a>' + + attachments_html += ( + f'<div class="attachment-box">' + f'<div class="attachment-icon">{attachment_svg}</div>' + f'<div class="attachment-name">{name}</div>' + f'</div>' + ) - attachments_html += ( - f'<div class="attachment-box">' - f'<div class="attachment-icon">{attachment_svg}</div>' - f'<div class="attachment-name">{name}</div>' - f'</div>
' - ) attachments_html += '' return attachments_html diff --git a/modules/loaders.py b/modules/loaders.py index 7546bc5b..e9437c16 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -55,6 +55,11 @@ loaders_and_params = OrderedDict({ 'trust_remote_code', 'no_use_fast', ], + 'ExLlamav3': [ + 'ctx_size', + 'cache_type', + 'gpu_split', + ], 'ExLlamav2_HF': [ 'ctx_size', 'cache_type', @@ -248,6 +253,41 @@ loaders_samplers = { 'grammar_string', 'grammar_file_row', }, + 'ExLlamav3': { + 'temperature', + 'dynatemp_low', + 'dynatemp_high', + 'dynatemp_exponent', + 'smoothing_factor', + 'min_p', + 'top_p', + 'top_k', + 'typical_p', + 'xtc_threshold', + 'xtc_probability', + 'tfs', + 'top_a', + 'dry_multiplier', + 'dry_allowed_length', + 'dry_base', + 'repetition_penalty', + 'frequency_penalty', + 'presence_penalty', + 'repetition_penalty_range', + 'mirostat_mode', + 'mirostat_tau', + 'mirostat_eta', + 'dynamic_temperature', + 'temperature_last', + 'auto_max_new_tokens', + 'ban_eos_token', + 'add_bos_token', + 'enable_thinking', + 'skip_special_tokens', + 'seed', + 'custom_token_bans', + 'dry_sequence_breakers', + }, 'ExLlamav2': { 'temperature', 'dynatemp_low', diff --git a/modules/models.py b/modules/models.py index c1e7fb56..cc500a40 100644 --- a/modules/models.py +++ b/modules/models.py @@ -19,6 +19,7 @@ def load_model(model_name, loader=None): 'llama.cpp': llama_cpp_server_loader, 'Transformers': transformers_loader, 'ExLlamav3_HF': ExLlamav3_HF_loader, + 'ExLlamav3': ExLlamav3_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'TensorRT-LLM': TensorRT_LLM_loader, @@ -88,6 +89,14 @@ def ExLlamav3_HF_loader(model_name): return Exllamav3HF.from_pretrained(model_name) +def ExLlamav3_loader(model_name): + from modules.exllamav3 import Exllamav3Model + + model = Exllamav3Model.from_pretrained(model_name) + tokenizer = model.tokenizer + return model, tokenizer + + def ExLlamav2_HF_loader(model_name): from modules.exllamav2_hf import Exllamav2HF @@ -116,7 +125,9 @@ def unload_model(keep_model_name=False): return is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') - if shared.model.__class__.__name__ == 'Exllamav3HF': + if shared.args.loader in ['ExLlamav3_HF', 'ExLlamav3']: + shared.model.unload() + elif shared.args.loader in ['ExLlamav2_HF', 'ExLlamav2'] and hasattr(shared.model, 'unload'): shared.model.unload() shared.model = shared.tokenizer = None diff --git a/modules/shared.py b/modules/shared.py index ab5198d1..1de4306b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -318,6 +318,8 @@ def fix_loader_name(name): return 'ExLlamav2_HF' elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']: return 'ExLlamav3_HF' + elif name in ['exllamav3']: + return 'ExLlamav3' elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: return 'TensorRT-LLM' diff --git a/modules/text_generation.py b/modules/text_generation.py index 8d1950b9..d6a87ce8 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -40,7 +40,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']: + if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']: generate_func = generate_reply_custom else: generate_func = 
generate_reply_HF @@ -128,9 +128,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt from modules.torch_utils import get_device - if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel']: + if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']: input_ids = shared.tokenizer.encode(str(prompt)) - if shared.model.__class__.__name__ != 'Exllamav2Model': + if shared.model.__class__.__name__ not in ['Exllamav2Model', 'Exllamav3Model']: input_ids = np.array(input_ids).reshape(1, len(input_ids)) else: input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) @@ -148,7 +148,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu: return input_ids else: device = get_device() @@ -295,6 +295,8 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, _StopEverythingStoppingCriteria ) + # Native ExLlamav3Model handles multimodal internally - no special routing needed + if shared.args.loader == 'Transformers': clear_torch_cache() diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 1d85a398..3b922fb4 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -54,7 +54,7 @@ def create_ui(): gr.HTML(value='
<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf', 'image'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['typing-dots'] = gr.HTML(value='<div class="typing"><span></span><span class="dot1"></span><span class="dot2"></span></div>
', label='typing', elem_id='typing-container') with gr.Column(scale=1, elem_id='generate-stop-container'): From 6e9de75727ace45b3bf71ea3a98ef350b6d7414d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 19:35:09 -0700 Subject: [PATCH 03/51] Support loading chat templates from chat_template.json files --- modules/models_settings.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 4e53dc81..729d5dd1 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -139,16 +139,26 @@ def get_model_metadata(model): with open(jinja_path, 'r', encoding='utf-8') as f: template = f.read() + # 2. If no .jinja file, try chat_template.json + if template is None: + json_template_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.json' + if json_template_path.exists(): + with open(json_template_path, 'r', encoding='utf-8') as f: + json_data = json.load(f) + if 'chat_template' in json_data: + template = json_data['chat_template'] + + # 3. Fall back to tokenizer_config.json metadata if path.exists(): metadata = json.loads(open(path, 'r', encoding='utf-8').read()) - # 2. Only read from metadata if we haven't already loaded from .jinja + # Only read from metadata if we haven't already loaded from .jinja or .json if template is None and 'chat_template' in metadata: template = metadata['chat_template'] if isinstance(template, list): template = template[0]['template'] - # 3. If a template was found from either source, process it + # 4. If a template was found from any source, process it if template: for k in ['eos_token', 'bos_token']: if k in metadata: From 8fcadff8d3120d1f3e844cd030d59a8c2b0b2dfd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 20:13:54 -0700 Subject: [PATCH 04/51] mtmd: Use the base64 attachment for the UI preview instead of the file --- modules/chat.py | 1 - modules/html_generator.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 354ae46b..98800239 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -640,7 +640,6 @@ def add_message_attachment(history, row_idx, file_path, is_user=True): "type": "image", "image_data": data_url, "image_id": image_id, - "file_path": str(path) # For UI preview } elif file_extension == '.pdf': # Process PDF file diff --git a/modules/html_generator.py b/modules/html_generator.py index 63a0cdd0..cb14a722 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -407,11 +407,10 @@ def format_message_attachments(history, role, index): name = html.escape(attachment["name"]) if attachment.get("type") == "image": - # Show image preview - file_path = attachment.get("file_path", "") + image_data = attachment.get("image_data", "") attachments_html += ( f'
<div class="attachment-box image-attachment">' - f'<img src="file/{file_path}" alt="{name}" class="image-preview">' + f'<img src="{image_data}" alt="{name}" class="image-preview">' f'<div class="attachment-name">{name}</div>' f'</div>
' ) From 544c3a7c9f305b6a2141c3d02770250058d43322 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 21:15:53 -0700 Subject: [PATCH 05/51] Polish the new exllamav3 loader --- modules/exllamav3.py | 152 +++++++++++++++++++++++++++++-------------- modules/loaders.py | 21 +----- 2 files changed, 104 insertions(+), 69 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index c2532ec3..295c2737 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -2,8 +2,22 @@ import traceback from pathlib import Path from typing import Any, List, Tuple +import torch from exllamav3 import Cache, Config, Generator, Model, Tokenizer from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant +from exllamav3.generator import Job +# Import the base sampler components directly from exllamav3 +from exllamav3.generator.sampler import ( + CustomSampler, + SS_Argmax, + SS_MinP, + SS_PresFreqP, + SS_RepP, + SS_Sample, + SS_Temperature, + SS_TopK, + SS_TopP +) from extensions.openai.image_utils import ( convert_image_attachments_to_pil, @@ -11,6 +25,7 @@ from extensions.openai.image_utils import ( ) from modules import shared from modules.logging_colors import logger +from modules.text_generation import get_max_prompt_length try: import flash_attn @@ -79,7 +94,6 @@ class Exllamav3Model: load_params['use_per_device'] = split model.load(**load_params) - tokenizer = Tokenizer.from_config(config) # Load vision model component (ExLlamaV3 native) @@ -127,11 +141,9 @@ class Exllamav3Model: # From webui image_attachments (preferred format) if 'image_attachments' in state and state['image_attachments']: pil_images.extend(convert_image_attachments_to_pil(state['image_attachments'])) - # From OpenAI API raw_images elif 'raw_images' in state and state['raw_images']: pil_images.extend(state['raw_images']) - # From OpenAI API messages format elif 'messages' in state and state['messages']: pil_images.extend(convert_openai_messages_to_images(state['messages'])) @@ -147,7 +159,6 @@ class Exllamav3Model: image_embeddings = state['image_embeddings'] else: # Do not reset the cache/allocator index; it causes token ID conflicts during generation. - logger.info(f"Processing {len(pil_images)} image(s) with ExLlamaV3 vision model") image_embeddings = [ self.vision_model.get_image_embeddings(tokenizer=self.tokenizer, image=img) @@ -178,46 +189,98 @@ class Exllamav3Model: """ Generate text with streaming using native ExLlamaV3 API """ - from exllamav3 import Job - from exllamav3.generator.sampler.presets import ComboSampler - # Process images and modify prompt (ExLlamaV3-specific) prompt, image_embeddings = self._process_images_for_generation(prompt, state) - sampler = ComboSampler( - rep_p=state.get('repetition_penalty', 1.0), - freq_p=state.get('frequency_penalty', 0.0), - pres_p=state.get('presence_penalty', 0.0), - temperature=state.get('temperature', 0.7), - min_p=state.get('min_p', 0.0), - top_k=state.get('top_k', 0), - top_p=state.get('top_p', 1.0), - ) + # -- Manually build and sort the sampler stack -- + # Greedy decoding is a special case + if state['temperature'] == 0: + sampler = CustomSampler([SS_Argmax()]) + else: + # 1. 
Create a list of all active, unordered samplers + unordered_samplers = [] + + # Penalties + penalty_range = state['repetition_penalty_range'] + if penalty_range <= 0: + penalty_range = -1 # ExllamaV3 uses -1 for whole context + rep_decay = 0 # Not a configurable parameter + + # Add penalty samplers if they are active + if state['repetition_penalty'] != 1.0: + unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay)) + if state['presence_penalty'] != 0.0 or state['frequency_penalty'] != 0.0: + unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], state['frequency_penalty'], penalty_range, rep_decay)) + + # Standard samplers + if state['top_k'] > 0: + unordered_samplers.append(SS_TopK(state['top_k'])) + if state['top_p'] < 1.0: + unordered_samplers.append(SS_TopP(state['top_p'])) + if state['min_p'] > 0.0: + unordered_samplers.append(SS_MinP(state['min_p'])) + + # Temperature + unordered_samplers.append(SS_Temperature(state['temperature'])) + + # 2. Define the mapping from class names to the priority list keys + class_name_to_nickname = { + 'SS_RepP': 'repetition_penalty', + 'SS_PresFreqP': 'presence_frequency_penalty', + 'SS_TopK': 'top_k', + 'SS_TopP': 'top_p', + 'SS_MinP': 'min_p', + 'SS_Temperature': 'temperature', + } + + # 3. Get the priority list and handle temperature_last + default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature'] + sampler_priority = state.get('sampler_priority', default_priority) + + if state['temperature_last'] and 'temperature' in sampler_priority: + sampler_priority.append(sampler_priority.pop(sampler_priority.index('temperature'))) + + # 4. Sort the unordered list based on the priority list + def custom_sort_key(sampler_obj): + class_name = sampler_obj.__class__.__name__ + nickname = class_name_to_nickname.get(class_name) + if nickname in sampler_priority: + return sampler_priority.index(nickname) + return -1 + + ordered_samplers = sorted(unordered_samplers, key=custom_sort_key) + + # 5. 
Add the final sampling stage and build the sampler + ordered_samplers.append(SS_Sample()) + sampler = CustomSampler(ordered_samplers) + # -- End of sampler building -- # Encode prompt with embeddings (ExLlamaV3-specific) - if image_embeddings: - input_ids = self.tokenizer.encode( - prompt, - encode_special_tokens=True, - embeddings=image_embeddings, - ) + input_ids = self.tokenizer.encode( + prompt, + add_bos=state['add_bos_token'], + encode_special_tokens=True, + embeddings=image_embeddings, + ) + + input_ids = input_ids[:, -get_max_prompt_length(state):] + + # Determine max_new_tokens + if state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - input_ids.shape[-1] else: - input_ids = self.tokenizer.encode(prompt, encode_special_tokens=True) + max_new_tokens = state['max_new_tokens'] - # Get stop conditions from state (webui format) - keep as strings like ExLlamaV3 examples + # Get stop conditions stop_conditions = [] - if 'stopping_strings' in state and state['stopping_strings']: - # Use strings directly (ExLlamaV3 handles the conversion internally) - stop_conditions.extend(state['stopping_strings']) - - # Add EOS token ID as ExLlamaV3 examples do - if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: - stop_conditions.append(self.tokenizer.eos_token_id) + if not state['ban_eos_token']: + if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + stop_conditions.append(self.tokenizer.eos_token_id) job = Job( input_ids=input_ids, - max_new_tokens=state.get('max_new_tokens', 500), - decode_special_tokens=True, + max_new_tokens=max_new_tokens, + decode_special_tokens=not state['skip_special_tokens'], embeddings=image_embeddings if image_embeddings else None, sampler=sampler, stop_conditions=stop_conditions if stop_conditions else None, @@ -244,25 +307,16 @@ class Exllamav3Model: pass def generate(self, prompt, state): - """ - Generate text using native ExLlamaV3 API (non-streaming) - """ - output = self.generator.generate( - prompt=prompt, - max_new_tokens=state.get('max_new_tokens', 500), - temperature=state.get('temperature', 0.7), - top_p=state.get('top_p', 1.0), - top_k=state.get('top_k', 0), - repetition_penalty=state.get('repetition_penalty', 1.0), - frequency_penalty=state.get('frequency_penalty', 0.0), - presence_penalty=state.get('presence_penalty', 0.0), - min_p=state.get('min_p', 0.0), - ) + output = "" + for chunk in self.generate_with_streaming(prompt, state): + output = chunk return output def encode(self, string, **kwargs): - return self.tokenizer.encode(string, **kwargs) + # Default add_bos to True for consistency with exllamav2 behavior + add_bos = kwargs.pop('add_bos', True) + return self.tokenizer.encode(string, add_bos=add_bos, **kwargs) def decode(self, ids, **kwargs): return self.tokenizer.decode(ids, **kwargs) @@ -301,8 +355,6 @@ class Exllamav3Model: # Force GPU memory cleanup import gc - - import torch gc.collect() if torch.cuda.is_available(): diff --git a/modules/loaders.py b/modules/loaders.py index e9437c16..151de990 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -255,38 +255,21 @@ loaders_samplers = { }, 'ExLlamav3': { 'temperature', - 'dynatemp_low', - 'dynatemp_high', - 'dynatemp_exponent', - 'smoothing_factor', 'min_p', 'top_p', 'top_k', - 'typical_p', - 'xtc_threshold', - 'xtc_probability', - 'tfs', - 'top_a', - 'dry_multiplier', - 'dry_allowed_length', - 'dry_base', 'repetition_penalty', 'frequency_penalty', 'presence_penalty', 'repetition_penalty_range', - 
'mirostat_mode', - 'mirostat_tau', - 'mirostat_eta', - 'dynamic_temperature', 'temperature_last', + 'sampler_priority', 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', 'enable_thinking', - 'skip_special_tokens', 'seed', - 'custom_token_bans', - 'dry_sequence_breakers', + 'skip_special_tokens', }, 'ExLlamav2': { 'temperature', From 9e260332cc9da24a407bb59aadf0cf6a9cf0d88c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Aug 2025 21:22:47 -0700 Subject: [PATCH 06/51] Remove some unnecessary code --- modules/exllamav3.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 295c2737..d616d2f5 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -26,6 +26,7 @@ from extensions.openai.image_utils import ( from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length +from modules.torch_utils import clear_torch_cache try: import flash_attn @@ -342,6 +343,7 @@ class Exllamav3Model: del self.model except Exception as e: logger.warning(f"Error unloading main model: {e}") + self.model = None if hasattr(self, 'cache') and self.cache is not None: @@ -352,14 +354,3 @@ class Exllamav3Model: if hasattr(self, 'tokenizer') and self.tokenizer is not None: self.tokenizer = None - - # Force GPU memory cleanup - import gc - gc.collect() - - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() - torch.cuda.empty_cache() - - logger.info("ExLlamaV3 model fully unloaded") From 1168004067dbe37791e6911fc3ed3386d8131ce3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:01:55 -0700 Subject: [PATCH 07/51] Minor change --- modules/exllamav3.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index d616d2f5..f6c56cb0 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -6,7 +6,6 @@ import torch from exllamav3 import Cache, Config, Generator, Model, Tokenizer from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from exllamav3.generator import Job -# Import the base sampler components directly from exllamav3 from exllamav3.generator.sampler import ( CustomSampler, SS_Argmax, From 3f5ec9644f5aec2045126cdc5d962ee6f0b44c14 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:06:07 -0700 Subject: [PATCH 08/51] mtmd: Place the image <__media__> at the top of the prompt --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 98800239..0a03a084 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -289,7 +289,7 @@ def generate_chat_prompt(user_input, state, **kwargs): if image_refs or attachments_text: enhanced_user_msg = user_msg if image_refs: - enhanced_user_msg += f" {image_refs}" + enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}" if attachments_text: enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}" From d9db8f63a719f799bac8f05ed567a1ba38041a72 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:25:42 -0700 Subject: [PATCH 09/51] mtmd: Simplifications --- extensions/openai/completions.py | 33 +++++++------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py 
index 3d389f0b..ff64527a 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -407,6 +407,10 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # Add messages to generate_params if present for multimodal processing if 'messages' in body: generate_params['messages'] = body['messages'] + raw_images = convert_openai_messages_to_images(generate_params['messages']) + if raw_images: + logger.info(f"Found {len(raw_images)} image(s) in request.") + generate_params['raw_images'] = raw_images if not stream: prompt_arg = body[prompt_str] @@ -423,7 +427,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): total_prompt_token_count = 0 for idx, prompt in enumerate(prompt_arg, start=0): - if isinstance(prompt[0], int): + if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], int): # token lists if requested_model == shared.model_name: prompt = decode(prompt)[0] @@ -438,19 +442,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # generate reply ####################################### debug_msg({'prompt': prompt, 'generate_params': generate_params}) - - # Use multimodal generation if images are present - if 'messages' in generate_params: - raw_images = convert_openai_messages_to_images(generate_params['messages']) - if raw_images: - logger.info(f"Using multimodal generation for {len(raw_images)} images") - generate_params['raw_images'] = raw_images - generator = shared.model.generate_with_streaming(prompt, generate_params) - else: - generator = generate_reply(prompt, generate_params, is_chat=False) - else: - generator = generate_reply(prompt, generate_params, is_chat=False) - + generator = generate_reply(prompt, generate_params, is_chat=False) answer = '' for a in generator: @@ -523,18 +515,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): # generate reply ####################################### debug_msg({'prompt': prompt, 'generate_params': generate_params}) - # Use multimodal generation if images are present - if 'messages' in generate_params: - raw_images = convert_openai_messages_to_images(generate_params['messages']) - if raw_images: - logger.info(f"Using multimodal generation for {len(raw_images)} images") - generate_params['raw_images'] = raw_images - generator = shared.model.generate_with_streaming(prompt, generate_params) - else: - generator = generate_reply(prompt, generate_params, is_chat=False) - else: - generator = generate_reply(prompt, generate_params, is_chat=False) - + generator = generate_reply(prompt, generate_params, is_chat=False) answer = '' seen_content = '' completion_token_count = 0 From fa9be444fa0b3e18763b1cd3d0dd07ad565ac1bb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:26:59 -0700 Subject: [PATCH 10/51] Use ExLlamav3 instead of ExLlamav3_HF by default for EXL3 models --- modules/models_settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 729d5dd1..d3bf4a36 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -211,11 +211,11 @@ def infer_loader(model_name, model_settings, hf_quant_method=None): elif re.match(r'.*\.gguf', model_name.lower()): loader = 'llama.cpp' elif hf_quant_method == 'exl3': - loader = 'ExLlamav3_HF' + loader = 'ExLlamav3' elif hf_quant_method in ['exl2', 'gptq']: loader = 'ExLlamav2_HF' elif re.match(r'.*exl3', 
model_name.lower()): - loader = 'ExLlamav3_HF' + loader = 'ExLlamav3' elif re.match(r'.*exl2', model_name.lower()): loader = 'ExLlamav2_HF' else: From f396b82a4f92f5823ed2a9bd1ff32d915da4cf9a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:31:36 -0700 Subject: [PATCH 11/51] mtmd: Better way to detect if an EXL3 model is multimodal --- modules/exllamav3.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index f6c56cb0..70f6c2f1 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -98,13 +98,16 @@ class Exllamav3Model: # Load vision model component (ExLlamaV3 native) vision_model = None - try: - logger.info("Loading vision model component...") - vision_model = Model.from_config(config, component="vision") - vision_model.load(progressbar=True) - logger.info("Vision model loaded successfully") - except Exception as e: - logger.warning(f"Vision model loading failed (multimodal disabled): {e}") + if "vision_config" in config.config_dict: + logger.info("Vision component detected in model config. Attempting to load...") + try: + vision_model = Model.from_config(config, component="vision") + vision_model.load(progressbar=True) + logger.info("Vision model loaded successfully.") + except Exception as e: + logger.warning(f"Vision model loading failed (multimodal disabled): {e}") + else: + logger.info("No vision component in model config. Skipping multimodal setup.") generator = Generator( model=model, From 59c6138e989861020816041821369fd9cd6b0ffa Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:32:15 -0700 Subject: [PATCH 12/51] Remove a log message --- modules/exllamav3.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 70f6c2f1..bdb0c760 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -48,7 +48,6 @@ class Exllamav3Model: global_allocator ) global_allocator.next_token_index = FIRST_MM_EMBEDDING_INDEX - logger.info("Reset MMTokenAllocator for clean multimodal token allocation") config = Config.from_directory(str(path_to_model)) model = Model.from_config(config) From 2fe79a93ccf21121db3dd076050df93f08c5bdb9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:50:24 -0700 Subject: [PATCH 13/51] mtmd: Handle another case after 3f5ec9644f5aec2045126cdc5d962ee6f0b44c14 --- modules/chat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 0a03a084..b127b489 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -326,7 +326,9 @@ def generate_chat_prompt(user_input, state, **kwargs): attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" if image_refs or attachments_text: - user_input = f"{user_input} {image_refs}" + user_input = user_input + if image_refs: + user_input = f"{image_refs}\n\n{user_input}" if attachments_text: user_input += f"\n\nATTACHMENTS:\n{attachments_text}" From a6d6bee88cf52dc4bbfaeac91df5a951810ab0dd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 07:51:03 -0700 Subject: [PATCH 14/51] Change a comment --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index b127b489..639feebf 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ 
-813,7 +813,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for file_path in files: add_message_attachment(output, row_idx, file_path, is_user=True) - # Collect image attachments for ExLlamaV3 + # Collect image attachments for multimodal generation image_attachments = [] if 'metadata' in output: user_key = f"user_{row_idx}" From d489eb589a698e37540861fe2951ef66efdb772d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 14:10:41 -0700 Subject: [PATCH 15/51] Attempt at fixing new exllamav3 loader undefined behavior when switching conversations --- modules/exllamav3.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index bdb0c760..e3a2d95a 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -304,9 +304,7 @@ class Exllamav3Model: response_text += chunk yield response_text finally: - # No cleanup needed. MMEmbedding lifetime is managed by Python. - # Cache and page table resets are unnecessary and can cause token ID conflicts. - pass + self.generator.clear_queue() def generate(self, prompt, state): output = "" From a289a92b9408e2542632ffa600ef57c373200aec Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 17:10:58 -0700 Subject: [PATCH 16/51] Fix exllamav3 token count --- modules/exllamav3.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index e3a2d95a..268a64ec 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -267,9 +267,11 @@ class Exllamav3Model: input_ids = input_ids[:, -get_max_prompt_length(state):] + self._last_prompt_token_count = input_ids.shape[-1] + # Determine max_new_tokens if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - input_ids.shape[-1] + max_new_tokens = state['truncation_length'] - self._last_prompt_token_count else: max_new_tokens = state['max_new_tokens'] @@ -323,8 +325,7 @@ class Exllamav3Model: @property def last_prompt_token_count(self): - # This would need to be tracked during generation - return 0 + return getattr(self, '_last_prompt_token_count', 0) def unload(self): logger.info("Unloading ExLlamaV3 model components...") From eb16f6401794340c751e8105bc3837967b9e054e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 17:12:16 -0700 Subject: [PATCH 17/51] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 
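The token-count fix above also changes how `auto_max_new_tokens` is budgeted: the count recorded after truncation is reused both for the generation budget and for the `last_prompt_token_count` property, which previously hard-coded 0. The arithmetic, with hypothetical numbers:

```python
# Budget arithmetic from the patch above (values are illustrative).
truncation_length = 8192       # total context the model was loaded with
prompt_tokens = 7000           # input_ids length after truncation
auto_max_new_tokens = True
max_new_tokens_setting = 512

if auto_max_new_tokens:
    max_new_tokens = truncation_length - prompt_tokens  # 1192: fill the context
else:
    max_new_tokens = max_new_tokens_setting             # fixed 512

# The same stored count now backs last_prompt_token_count, so prompt-token
# usage reported by the API reflects the real prompt instead of 0.
```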
files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index f17cae8a..323ef0f9 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 51f4571f..2a7c9361 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 37021c77..0106fbea 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == 
"Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index f54ae191..d5db4a1c 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index e495455b..694f1ff8 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and 
python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 72847534..392637e2 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index ed641a24..88eaa930 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index d7fe735b..6accc2f0 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ 
sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index cb71f74b..3025f092 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index d6bed576..7394bdcf 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 1f17dc50..a095a4c7 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 82254842..ea43e56e 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; 
platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 986a3d49..79737728 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 833e923b..d39786bd 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 6a894d49..0b373fa9 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 0afb19c2..fe9dccac 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index a404f50c..b3cfd525 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 75176656..02aa03e3 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From d86b0ec0103f24f27329a600d1cbaf3a5ea4c517 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Sun, 
10 Aug 2025 01:27:25 -0300 Subject: [PATCH 18/51] Add multimodal support (llama.cpp) (#7027) --- extensions/openai/image_utils.py | 9 ++++ modules/chat.py | 26 ++++++------ modules/llama_cpp_server.py | 46 ++++++++++++++++++--- modules/loaders.py | 2 + modules/shared.py | 1 + modules/ui.py | 1 + modules/ui_model_menu.py | 6 +++ modules/utils.py | 13 ++++++ user_data/mmproj/place-your-mmproj-here.txt | 0 9 files changed, 86 insertions(+), 18 deletions(-) create mode 100644 user_data/mmproj/place-your-mmproj-here.txt diff --git a/extensions/openai/image_utils.py b/extensions/openai/image_utils.py index c54f0532..658f00d7 100644 --- a/extensions/openai/image_utils.py +++ b/extensions/openai/image_utils.py @@ -11,6 +11,15 @@ from PIL import Image from modules.logging_colors import logger +def convert_pil_to_base64(image: Image.Image) -> str: + """Converts a PIL Image to a base64 encoded string.""" + buffered = io.BytesIO() + # Save image to an in-memory bytes buffer in PNG format + image.save(buffered, format="PNG") + # Encode the bytes to a base64 string + return base64.b64encode(buffered.getvalue()).decode('utf-8') + + def decode_base64_image(base64_string: str) -> Image.Image: """Decodes a base64 string to a PIL Image.""" try: diff --git a/modules/chat.py b/modules/chat.py index 639feebf..696fa350 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -813,19 +813,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for file_path in files: add_message_attachment(output, row_idx, file_path, is_user=True) - # Collect image attachments for multimodal generation - image_attachments = [] - if 'metadata' in output: - user_key = f"user_{row_idx}" - if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: - for attachment in output['metadata'][user_key]["attachments"]: - if attachment.get("type") == "image": - image_attachments.append(attachment) - - # Add image attachments to state for the generation - if image_attachments: - state['image_attachments'] = image_attachments - # Add web search results as attachments if enabled if state.get('enable_web_search', False): search_query = generate_search_query(text, state) @@ -881,6 +868,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess 'metadata': output['metadata'] } + # Collect image attachments for multimodal generation + image_attachments = [] + if 'metadata' in output: + user_key = f"user_{row_idx}" + if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: + for attachment in output['metadata'][user_key]["attachments"]: + if attachment.get("type") == "image": + image_attachments.append(attachment) + + # Add image attachments to state for the generation + if image_attachments: + state['image_attachments'] = image_attachments + # Generate the prompt kwargs = { '_continue': _continue, diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index e64f1694..072ff83b 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -12,6 +12,10 @@ from pathlib import Path import llama_cpp_binaries import requests +from extensions.openai.image_utils import ( + convert_image_attachments_to_pil, + convert_pil_to_base64 +) from modules import shared from modules.logging_colors import logger @@ -128,15 +132,40 @@ class LlamaServer: url = f"http://127.0.0.1:{self.port}/completion" payload = self.prepare_payload(state) - token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"]) - 
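`convert_pil_to_base64` above is a plain in-memory PNG encode. A quick round-trip check, assuming Pillow is installed (the import path reflects this patch; a later patch in this series moves the module to `modules/image_utils.py`):

```python
import base64
import io

from PIL import Image

from extensions.openai.image_utils import convert_pil_to_base64

img = Image.new("RGB", (32, 32), color=(200, 80, 40))  # synthetic test image
b64 = convert_pil_to_base64(img)                       # PNG bytes -> base64 str
restored = Image.open(io.BytesIO(base64.b64decode(b64)))
assert restored.size == (32, 32) and restored.format == "PNG"
```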
self.last_prompt_token_count = len(token_ids) + pil_images = [] + # Check for images from the Web UI (image_attachments) + if 'image_attachments' in state and state['image_attachments']: + pil_images.extend(convert_image_attachments_to_pil(state['image_attachments'])) + # Else, check for images from the API (raw_images) + elif 'raw_images' in state and state['raw_images']: + pil_images.extend(state.get('raw_images', [])) + + if pil_images: + # Multimodal case + IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image + + base64_images = [convert_pil_to_base64(img) for img in pil_images] + multimodal_prompt_object = { + "prompt": prompt, + "multimodal_data": base64_images + } + payload["prompt"] = multimodal_prompt_object + + # Calculate an estimated token count + text_tokens = self.encode(prompt, add_bos_token=state["add_bos_token"]) + self.last_prompt_token_count = len(text_tokens) + (len(pil_images) * IMAGE_TOKEN_COST_ESTIMATE) + else: + # Text only case + token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"]) + self.last_prompt_token_count = len(token_ids) + payload["prompt"] = token_ids + if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - len(token_ids) + max_new_tokens = state['truncation_length'] - self.last_prompt_token_count else: max_new_tokens = state['max_new_tokens'] payload.update({ - "prompt": token_ids, "n_predict": max_new_tokens, "stream": True, "cache_prompt": True @@ -144,7 +173,7 @@ class LlamaServer: if shared.args.verbose: logger.info("GENERATE_PARAMS=") - printable_payload = {k: v for k, v in payload.items() if k != "prompt"} + printable_payload = {k: (v if k != "prompt" else "[multimodal object]" if pil_images else v) for k, v in payload.items()} pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() @@ -295,6 +324,13 @@ class LlamaServer: cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] if shared.args.rope_freq_base > 0: cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)] + if shared.args.mmproj not in [None, 'None']: + path = Path(shared.args.mmproj) + if not path.exists(): + path = Path('user_data/mmproj') / shared.args.mmproj + + if path.exists(): + cmd += ["--mmproj", str(path)] if shared.args.model_draft not in [None, 'None']: path = Path(shared.args.model_draft) if not path.exists(): diff --git a/modules/loaders.py b/modules/loaders.py index 151de990..feca9985 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -28,6 +28,8 @@ loaders_and_params = OrderedDict({ 'device_draft', 'ctx_size_draft', 'speculative_decoding_accordion', + 'mmproj', + 'mmproj_accordion', 'vram_info', ], 'Transformers': [ diff --git a/modules/shared.py b/modules/shared.py index 1de4306b..e9d8a62f 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -85,6 +85,7 @@ group.add_argument('--no-kv-offload', action='store_true', help='Do not offload group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". 
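The payload shape above is the crux of the llama.cpp multimodal path: when images are present, `prompt` becomes an object carrying the text plus base64-encoded images, and the prompt length can only be estimated, since the mmproj projector decides how many tokens each image actually occupies. A sketch of the resulting request body, with hypothetical values:

```python
# Shape of the llama-server /completion payload built above (illustrative).
IMAGE_TOKEN_COST_ESTIMATE = 600  # conservative per-image guess from the patch

payload = {
    "prompt": {
        "prompt": "USER: <__media__>\nDescribe this image.\nASSISTANT:",
        "multimodal_data": ["iVBORw0KGgo..."],  # base64 PNG (truncated here)
    },
    "n_predict": 512,
    "stream": True,
    "cache_prompt": True,
}

text_tokens = 25  # stand-in for len(self.encode(prompt, ...))
estimated_prompt_tokens = text_tokens + 1 * IMAGE_TOKEN_COST_ESTIMATE
```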
Example: "override-tensor=exps=CPU"') group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') +group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.') # Cache group = parser.add_argument_group('Context and cache') diff --git a/modules/ui.py b/modules/ui.py index e7805046..1171cd48 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -167,6 +167,7 @@ def list_model_elements(): 'gpu_layers_draft', 'device_draft', 'ctx_size_draft', + 'mmproj', ] return elements diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 0ab67e7a..9fa8a4f4 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -59,6 +59,12 @@ def create_ui(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') + # Multimodal + with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']: + with gr.Row(): + shared.gradio['mmproj'] = gr.Dropdown(label="mmproj file", choices=utils.get_available_mmproj(), value=lambda: shared.args.mmproj or 'None', elem_classes='slim-dropdown', info='Select a file that matches your model. 
Must be placed in user_data/mmproj/', interactive=not mu) + ui.create_refresh_button(shared.gradio['mmproj'], lambda: None, lambda: {'choices': utils.get_available_mmproj()}, 'refresh-button', interactive=not mu) + # Speculative decoding with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']: with gr.Row(): diff --git a/modules/utils.py b/modules/utils.py index 117ad590..4927ef04 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -154,6 +154,19 @@ def get_available_ggufs(): return sorted(model_list, key=natural_keys) +def get_available_mmproj(): + mmproj_dir = Path('user_data/mmproj') + if not mmproj_dir.exists(): + return ['None'] + + mmproj_files = [] + for item in mmproj_dir.iterdir(): + if item.is_file() and item.suffix.lower() in ('.gguf', '.bin'): + mmproj_files.append(item.name) + + return ['None'] + sorted(mmproj_files, key=natural_keys) + + def get_available_presets(): return sorted(set((k.stem for k in Path('user_data/presets').glob('*.yaml'))), key=natural_keys) diff --git a/user_data/mmproj/place-your-mmproj-here.txt b/user_data/mmproj/place-your-mmproj-here.txt new file mode 100644 index 00000000..e69de29b From c6b4d1e87f67dd990c66eaecb35c9cc70d0ae4e3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:33:12 -0700 Subject: [PATCH 19/51] Fix the exllamav2 loader ignoring add_bos --- modules/exllamav2.py | 3 ++- modules/exllamav3.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 6bb422ea..5d5c5b56 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -135,7 +135,8 @@ class Exllamav2Model: return result, result def encode(self, string, **kwargs): - return self.tokenizer.encode(string, add_bos=True, encode_special_tokens=True) + add_bos = kwargs.pop('add_bos', True) + return self.tokenizer.encode(string, add_bos=add_bos, encode_special_tokens=True, **kwargs) def decode(self, ids, **kwargs): if isinstance(ids, list): diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 268a64ec..9201801c 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -316,7 +316,6 @@ class Exllamav3Model: return output def encode(self, string, **kwargs): - # Default add_bos to True for consistency with exllamav2 behavior add_bos = kwargs.pop('add_bos', True) return self.tokenizer.encode(string, add_bos=add_bos, **kwargs) From 2f90ac98807a4ffa6a761bbcef5cf81a9de568b8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:41:38 -0700 Subject: [PATCH 20/51] Move the new image_utils.py file to modules/ --- extensions/openai/completions.py | 2 +- modules/exllamav3.py | 4 ++-- {extensions/openai => modules}/image_utils.py | 0 modules/llama_cpp_server.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) rename {extensions/openai => modules}/image_utils.py (100%) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index ff64527a..f4944060 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -7,7 +7,6 @@ import tiktoken from pydantic import ValidationError from extensions.openai.errors import InvalidRequestError -from extensions.openai.image_utils import convert_openai_messages_to_images from extensions.openai.typing import ToolDefinition from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from modules import shared @@ 
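The `add_bos` fix above follows a common kwargs-passthrough pattern: pop the flag with a default so existing callers keep the old behavior while new callers can disable the BOS token. A self-contained sketch with stand-in token ids:

```python
# Stand-in for the tokenizer call; real encode() delegates to the tokenizer.
def encode(string, **kwargs):
    add_bos = kwargs.pop('add_bos', True)          # default preserved for old callers
    return ([1] if add_bos else []) + [101, 102]   # fake BOS id + fake token ids

assert encode("hi") == [1, 101, 102]               # unchanged default behavior
assert encode("hi", add_bos=False) == [101, 102]   # flag is now actually honored
```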
-17,6 +16,7 @@ from modules.chat import ( load_character_memoized, load_instruction_template_memoized ) +from modules.image_utils import convert_openai_messages_to_images from modules.logging_colors import logger from modules.presets import load_preset_memoized from modules.text_generation import decode, encode, generate_reply diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 9201801c..9d597ce7 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -18,11 +18,11 @@ from exllamav3.generator.sampler import ( SS_TopP ) -from extensions.openai.image_utils import ( +from modules import shared +from modules.image_utils import ( convert_image_attachments_to_pil, convert_openai_messages_to_images ) -from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length from modules.torch_utils import clear_torch_cache diff --git a/extensions/openai/image_utils.py b/modules/image_utils.py similarity index 100% rename from extensions/openai/image_utils.py rename to modules/image_utils.py diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 072ff83b..3e8127ab 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -12,11 +12,11 @@ from pathlib import Path import llama_cpp_binaries import requests -from extensions.openai.image_utils import ( +from modules import shared +from modules.image_utils import ( convert_image_attachments_to_pil, convert_pil_to_base64 ) -from modules import shared from modules.logging_colors import logger llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"} From 4663b1a56e99b8d637f9ac67c8b8d0e09d496ec7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:45:50 -0700 Subject: [PATCH 21/51] Update docs --- docs/12 - OpenAI API.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index b7b5fbc1..fc76cd8b 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -77,7 +77,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` -#### Multimodal support (ExLlamaV3) +#### Multimodal support (llama.cpp and ExLlamaV3) ```shell curl http://127.0.0.1:5000/v1/chat/completions \ From 0ea62d88f60689b44dd4ee42ae9ba0ff871a29c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Aug 2025 21:47:02 -0700 Subject: [PATCH 22/51] mtmd: Fix "continue" when an image is present --- modules/chat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 696fa350..42bb58a5 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -868,6 +868,8 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess 'metadata': output['metadata'] } + row_idx = len(output['internal']) - 1 + # Collect image attachments for multimodal generation image_attachments = [] if 'metadata' in output: @@ -895,7 +897,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess prompt = generate_chat_prompt(text, state, **kwargs) # Add timestamp for assistant's response at the start of generation - row_idx = len(output['internal']) - 1 update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp(), model_name=shared.model_name) # Generate From 1fb580785937ad42e8657a5fd894dfcd5a1fdeb3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 
2025 06:54:44 -0700 Subject: [PATCH 23/51] mtmd: Fix API text completion when no images are sent --- extensions/openai/completions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index f4944060..6f4dfc29 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -405,7 +405,7 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False): echo = body['echo'] # Add messages to generate_params if present for multimodal processing - if 'messages' in body: + if body.get('messages'): generate_params['messages'] = body['messages'] raw_images = convert_openai_messages_to_images(generate_params['messages']) if raw_images: From 6fbf162d712cef876b128651fdebeb08a4f32538 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 2025 07:21:55 -0700 Subject: [PATCH 24/51] Default max_tokens to 512 in the API instead of 16 --- extensions/openai/typing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index e9f92da5..90366270 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -106,7 +106,7 @@ class CompletionRequestParams(BaseModel): frequency_penalty: float | None = 0 logit_bias: dict | None = None logprobs: int | None = None - max_tokens: int | None = 16 + max_tokens: int | None = 512 n: int | None = Field(default=1, description="Unused parameter.") presence_penalty: float | None = 0 stop: str | List[str] | None = None @@ -232,7 +232,7 @@ class LogitsRequestParams(BaseModel): use_samplers: bool = False top_logits: int | None = 50 frequency_penalty: float | None = 0 - max_tokens: int | None = 16 + max_tokens: int | None = 512 presence_penalty: float | None = 0 temperature: float | None = 1 top_p: float | None = 1 From cc964ee579463d4c3acb35c188ee8eb38e23ce1a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 2025 07:44:38 -0700 Subject: [PATCH 25/51] mtmd: Increase the size of the UI image preview --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index de16d81d..062d3eb2 100644 --- a/css/main.css +++ b/css/main.css @@ -1579,6 +1579,7 @@ strong { .image-attachment { flex-direction: column; + max-width: 314px; } .image-preview { From 9ec310d858824d5f9186bca277a3ab77ac556b75 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 2025 07:54:21 -0700 Subject: [PATCH 26/51] UI: Fix the color of italic text --- css/html_instruct_style.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 9831ee8f..3e5ebe67 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -13,7 +13,7 @@ line-height: 28px !important; } -.dark .chat .message-body :is(p, li, q, h1, h2, h3, h4, h5, h6) { +.dark .chat .message-body :is(p, li, q, em, h1, h2, h3, h4, h5, h6) { color: #d1d5db !important; } From c5340533c0b3a9edaea6c253f99250f09f2c26a5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Aug 2025 20:39:04 -0700 Subject: [PATCH 27/51] mtmd: Add another API example --- docs/12 - OpenAI API.md | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 
fc76cd8b..5dc98a51 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -77,7 +77,9 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` -#### Multimodal support (llama.cpp and ExLlamaV3) +#### Multimodal/vision (llama.cpp and ExLlamaV3) + +##### /v1/chat/completions (recommended!) ```shell curl http://127.0.0.1:5000/v1/chat/completions \ @@ -87,7 +89,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ { "role": "user", "content": [ - {"type": "text", "text": "What color is this image?"}, + {"type": "text", "text": "Please describe what you see in this image."}, {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}} ] } @@ -95,6 +97,38 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` +##### /v1/completions + +```shell +curl http://127.0.0.1:5000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "About image <__media__> and image <__media__>, what I can say is that the first one" + }, + { + "type": "image_url", + "image_url": { + "url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true" + } + }, + { + "type": "image_url", + "image_url": { + "url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/strawberry.png?raw=true" + } + } + ] + } + ] + }' +``` + #### SSE streaming ```shell From 4d8dbbab648d14680741324d187bee23a8bea486 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:26:11 -0700 Subject: [PATCH 28/51] API: Fix sampler_priority usage for ExLlamaV3 --- modules/exllamav3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 9d597ce7..8f686669 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -237,7 +237,7 @@ class Exllamav3Model: # 3. 
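For reference, the `/v1/completions` example above can be issued from Python as well; this assumes the local server from the docs and the `requests` package, with images consumed in order, each one filling the next `<__media__>` marker:

```python
import requests

resp = requests.post(
    "http://127.0.0.1:5000/v1/completions",
    json={
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text",
                 "text": "About image <__media__> and image <__media__>, what I can say is that the first one"},
                {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/cat.png?raw=true"}},
                {"type": "image_url", "image_url": {"url": "https://github.com/turboderp-org/exllamav3/blob/master/examples/media/strawberry.png?raw=true"}},
            ],
        }],
    },
)
print(resp.json()["choices"][0]["text"])  # assumes the standard completions shape
```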
Get the priority list and handle temperature_last default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature'] - sampler_priority = state.get('sampler_priority', default_priority) + sampler_priority = state.get('sampler_priority') or default_priority if state['temperature_last'] and 'temperature' in sampler_priority: sampler_priority.append(sampler_priority.pop(sampler_priority.index('temperature'))) From 4809ddfeb85e8b8d28bb617366c86fd8037815ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:35:22 -0700 Subject: [PATCH 29/51] Exllamav3: small sampler fixes --- modules/exllamav3.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 8f686669..5c142ec2 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -6,7 +6,9 @@ import torch from exllamav3 import Cache, Config, Generator, Model, Tokenizer from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from exllamav3.generator import Job -from exllamav3.generator.sampler import ( + +from modules import shared +from modules.exllamav3_custom_sampler import ( CustomSampler, SS_Argmax, SS_MinP, @@ -17,8 +19,6 @@ from exllamav3.generator.sampler import ( SS_TopK, SS_TopP ) - -from modules import shared from modules.image_utils import ( convert_image_attachments_to_pil, convert_openai_messages_to_images @@ -194,7 +194,6 @@ class Exllamav3Model: # Process images and modify prompt (ExLlamaV3-specific) prompt, image_embeddings = self._process_images_for_generation(prompt, state) - # -- Manually build and sort the sampler stack -- # Greedy decoding is a special case if state['temperature'] == 0: sampler = CustomSampler([SS_Argmax()]) @@ -205,7 +204,7 @@ class Exllamav3Model: # Penalties penalty_range = state['repetition_penalty_range'] if penalty_range <= 0: - penalty_range = -1 # ExllamaV3 uses -1 for whole context + penalty_range = int(10e7) # Use large number for "full context" rep_decay = 0 # Not a configurable parameter # Add penalty samplers if they are active @@ -222,7 +221,7 @@ class Exllamav3Model: if state['min_p'] > 0.0: unordered_samplers.append(SS_MinP(state['min_p'])) - # Temperature + # Temperature (SS_NoOp is returned if temp is 1.0) unordered_samplers.append(SS_Temperature(state['temperature'])) # 2. Define the mapping from class names to the priority list keys @@ -246,7 +245,7 @@ class Exllamav3Model: def custom_sort_key(sampler_obj): class_name = sampler_obj.__class__.__name__ nickname = class_name_to_nickname.get(class_name) - if nickname in sampler_priority: + if nickname and nickname in sampler_priority: return sampler_priority.index(nickname) return -1 @@ -255,7 +254,6 @@ class Exllamav3Model: # 5. 
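Two details in the sampler changes above are easy to miss. First, `dict.get(key, default)` still returns `None` or an empty list when the key is present but falsy, so the fix switches to `get(key) or default`. Second, the sort key returns -1 for stages with no entry in the priority list, so they run first instead of raising `ValueError` from `.index()`. A condensed sketch with a hypothetical priority list:

```python
state = {'sampler_priority': None}  # key present but unset
default_priority = ['repetition_penalty', 'top_k', 'top_p', 'min_p', 'temperature']

# .get(key, default) would return None here; `or` falls back as intended.
priority = state.get('sampler_priority') or default_priority

class_name_to_nickname = {'SS_TopK': 'top_k', 'SS_Temperature': 'temperature'}

def custom_sort_key(sampler_obj):
    nickname = class_name_to_nickname.get(sampler_obj.__class__.__name__)
    # Unlisted stages sort first (-1); listed ones follow the user's order.
    return priority.index(nickname) if nickname and nickname in priority else -1
```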
Add the final sampling stage and build the sampler ordered_samplers.append(SS_Sample()) sampler = CustomSampler(ordered_samplers) - # -- End of sampler building -- # Encode prompt with embeddings (ExLlamaV3-specific) input_ids = self.tokenizer.encode( From 1cb800d3923093e102470c6dde4e4a8b451e0a33 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:37:10 -0700 Subject: [PATCH 30/51] Docs: small change --- docs/12 - OpenAI API.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 5dc98a51..fd3309c7 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -79,7 +79,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ #### Multimodal/vision (llama.cpp and ExLlamaV3) -##### /v1/chat/completions (recommended!) +##### With /v1/chat/completions (recommended!) ```shell curl http://127.0.0.1:5000/v1/chat/completions \ @@ -97,7 +97,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ }' ``` -##### /v1/completions +##### With /v1/completions ```shell curl http://127.0.0.1:5000/v1/completions \ From 52d1cbbbe95bed853241ab422b0558ab029d7d08 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:38:39 -0700 Subject: [PATCH 31/51] Fix an import --- modules/exllamav3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 5c142ec2..3fabdb6b 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -8,7 +8,7 @@ from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from exllamav3.generator import Job from modules import shared -from modules.exllamav3_custom_sampler import ( +from exllamav3.generator.sampler import ( CustomSampler, SS_Argmax, SS_MinP, From 38c0b4a1adc613e9e3a237835faa1d88632733ef Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:39:53 -0700 Subject: [PATCH 32/51] Default ctx-size to 8192 when not found in the metadata --- modules/models_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index d3bf4a36..bf7b1cf9 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -15,7 +15,7 @@ from modules.logging_colors import logger def get_fallback_settings(): return { 'bf16': False, - 'ctx_size': 2048, + 'ctx_size': 8192, 'rope_freq_base': 0, 'compress_pos_emb': 1, 'alpha_value': 1, From b62c8845f34f3faac2481e368e6a4c67fd33fa59 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 08:22:17 -0700 Subject: [PATCH 33/51] mtmd: Fix /chat/completions for llama.cpp --- extensions/openai/completions.py | 18 +++++++++++++++--- modules/chat.py | 21 +++++++++++---------- modules/llama_cpp_server.py | 8 ++++++-- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 6f4dfc29..c3037d0c 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -85,16 +85,28 @@ def process_parameters(body, is_legacy=False): def process_multimodal_content(content): - """Extract text from OpenAI multimodal format for non-multimodal models""" + """Extract text and add image placeholders from OpenAI multimodal format""" if isinstance(content, str): return content if isinstance(content, list): text_parts = [] + 
image_placeholders = "" for item in content: - if isinstance(item, dict) and item.get('type') == 'text': + if not isinstance(item, dict): + continue + + item_type = item.get('type', '') + if item_type == 'text': text_parts.append(item.get('text', '')) - return ' '.join(text_parts) if text_parts else str(content) + elif item_type == 'image_url': + image_placeholders += "<__media__>" + + final_text = ' '.join(text_parts) + if image_placeholders: + return f"{image_placeholders}\n\n{final_text}" + else: + return final_text return str(content) diff --git a/modules/chat.py b/modules/chat.py index 42bb58a5..7b1629dd 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -870,18 +870,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess row_idx = len(output['internal']) - 1 - # Collect image attachments for multimodal generation - image_attachments = [] + # Collect image attachments for multimodal generation from the entire history + all_image_attachments = [] if 'metadata' in output: - user_key = f"user_{row_idx}" - if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: - for attachment in output['metadata'][user_key]["attachments"]: - if attachment.get("type") == "image": - image_attachments.append(attachment) + for i in range(len(output['internal'])): + user_key = f"user_{i}" + if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: + for attachment in output['metadata'][user_key]["attachments"]: + if attachment.get("type") == "image": + all_image_attachments.append(attachment) - # Add image attachments to state for the generation - if image_attachments: - state['image_attachments'] = image_attachments + # Add all collected image attachments to state for the generation + if all_image_attachments: + state['image_attachments'] = all_image_attachments # Generate the prompt kwargs = { diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 3e8127ab..63c8eda0 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -15,6 +15,7 @@ import requests from modules import shared from modules.image_utils import ( convert_image_attachments_to_pil, + convert_openai_messages_to_images, convert_pil_to_base64 ) from modules.logging_colors import logger @@ -133,10 +134,13 @@ class LlamaServer: payload = self.prepare_payload(state) pil_images = [] - # Check for images from the Web UI (image_attachments) + # Source 1: Web UI (from chatbot_wrapper) if 'image_attachments' in state and state['image_attachments']: pil_images.extend(convert_image_attachments_to_pil(state['image_attachments'])) - # Else, check for images from the API (raw_images) + # Source 2: Chat Completions API (/v1/chat/completions) + elif 'history' in state and state.get('history', {}).get('messages'): + pil_images.extend(convert_openai_messages_to_images(state['history']['messages'])) + # Source 3: Legacy Completions API (/v1/completions) elif 'raw_images' in state and state['raw_images']: pil_images.extend(state.get('raw_images', [])) From b10d525bf7618c415c88937e26c1a2240c3b2fcf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:05:22 -0700 Subject: [PATCH 34/51] UI: Update a tooltip --- js/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index e0f9314d..66a344b3 100644 --- a/js/main.js +++ b/js/main.js @@ -977,7 +977,7 @@ if (document.readyState === "loading") { 
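With the changes above, the API and UI paths converge: `process_multimodal_content` prepends one `<__media__>` marker per image, and the llama.cpp server checks image sources in a fixed order (Web UI attachments, then chat-completions history, then legacy `raw_images`). The expected behavior of the updated function, as an illustrative check:

```python
content = [
    {"type": "text", "text": "Compare these."},
    {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
    {"type": "image_url", "image_url": {"url": "https://example.com/b.png"}},
]
# One marker per image, then a blank line, then the joined text parts.
assert process_multimodal_content(content) == "<__media__><__media__>\n\nCompare these."
```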
//------------------------------------------------ // File upload button -document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, and DOCX documents"; +document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, DOCX documents, and images"; // Activate web search document.getElementById("web-search").title = "Search the internet with DuckDuckGo"; From 1ba1211ca027b887f160f6894a004b7d96ea0eee Mon Sep 17 00:00:00 2001 From: Mykeehu Date: Mon, 11 Aug 2025 21:13:56 +0200 Subject: [PATCH 35/51] Fix edit window and buttons in Messenger theme (#7100) --- css/chat_style-messenger.css | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index 65af5f7a..583703c0 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -99,3 +99,9 @@ .message-body p em { color: rgb(110 110 110) !important; } +.editing-textarea { + width: max(30rem) !important; +} +.circle-you + .text .edit-control-button, .circle-you + .text .editing-textarea { + color: #000 !important; +} From 999471256c0626bb29e9caa65bbf96b8d2cb52d6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:32:17 -0700 Subject: [PATCH 36/51] Lint --- modules/exllamav2.py | 2 +- modules/exllamav3.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 5d5c5b56..3b3233d2 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -3,6 +3,7 @@ import traceback from pathlib import Path import torch + from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -15,7 +16,6 @@ from exllamav2 import ( ExLlamaV2Tokenizer ) from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator - from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 3fabdb6b..980230f8 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -2,12 +2,9 @@ import traceback from pathlib import Path from typing import Any, List, Tuple -import torch from exllamav3 import Cache, Config, Generator, Model, Tokenizer from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from exllamav3.generator import Job - -from modules import shared from exllamav3.generator.sampler import ( CustomSampler, SS_Argmax, @@ -19,13 +16,13 @@ from exllamav3.generator.sampler import ( SS_TopK, SS_TopP ) +from modules import shared from modules.image_utils import ( convert_image_attachments_to_pil, convert_openai_messages_to_images ) from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length -from modules.torch_utils import clear_torch_cache try: import flash_attn @@ -205,13 +202,13 @@ class Exllamav3Model: penalty_range = state['repetition_penalty_range'] if penalty_range <= 0: penalty_range = int(10e7) # Use large number for "full context" - rep_decay = 0 # Not a configurable parameter + rep_decay = 0 # Not a configurable parameter # Add penalty samplers if they are active if state['repetition_penalty'] != 1.0: - unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay)) + unordered_samplers.append(SS_RepP(state['repetition_penalty'], penalty_range, rep_decay)) if state['presence_penalty'] != 0.0 or state['frequency_penalty'] != 0.0: - unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], 
state['frequency_penalty'], penalty_range, rep_decay)) + unordered_samplers.append(SS_PresFreqP(state['presence_penalty'], state['frequency_penalty'], penalty_range, rep_decay)) # Standard samplers if state['top_k'] > 0: From a78ca6ffcdf0c53efdce8bfa6b37825590f5ae6e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:33:38 -0700 Subject: [PATCH 37/51] Remove a comment --- modules/text_generation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index d6a87ce8..27c5de7d 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -295,8 +295,6 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, _StopEverythingStoppingCriteria ) - # Native ExLlamav3Model handles multimodal internally - no special routing needed - if shared.args.loader == 'Transformers': clear_torch_cache() From 765af1ba1736b209427232d5bec1b2e55b099e1b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:39:18 -0700 Subject: [PATCH 38/51] API: Improve a validation --- extensions/openai/typing.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 90366270..56d91582 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -2,7 +2,7 @@ import json import time from typing import Dict, List, Optional -from pydantic import BaseModel, Field, field_validator, validator +from pydantic import BaseModel, Field, model_validator, validator class GenerationOptions(BaseModel): @@ -116,16 +116,11 @@ class CompletionRequestParams(BaseModel): top_p: float | None = 1 user: str | None = Field(default=None, description="Unused parameter.") - @field_validator('prompt', 'messages') - @classmethod - def validate_prompt_or_messages(cls, v, info): - """Ensure either 'prompt' or 'messages' is provided for completions.""" - if info.field_name == 'prompt': # If we're validating 'prompt', check if neither prompt nor messages will be set - messages = info.data.get('messages') - if v is None and messages is None: - raise ValueError("Either 'prompt' or 'messages' must be provided") - - return v + @model_validator(mode='after') + def validate_prompt_or_messages(self): + if self.prompt is None and self.messages is None: + raise ValueError("Either 'prompt' or 'messages' must be provided") + return self class CompletionRequest(GenerationOptions, CompletionRequestParams): From 1e3c4e8bdbc3e8d313bfab016bc6f1853c4ad4b7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 14:40:59 -0700 Subject: [PATCH 39/51] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- 
requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 323ef0f9..789539fc 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 2a7c9361..d7922478 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0106fbea..2a3337a3 100644 --- 
a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index d5db4a1c..7287497d 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 694f1ff8..48ebe381 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release 
>= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 392637e2..ccf80d06 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 88eaa930..e819dd04 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; 
platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 6accc2f0..8b9c882c 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 3025f092..ce81c5ff 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git 
a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 7394bdcf..6233b84a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index a095a4c7..e3a863ec 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index ea43e56e..26f813d2 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 79737728..4de1159d 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d39786bd..fded9898 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 0b373fa9..013364ff 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index fe9dccac..85e95eb3 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index b3cfd525..945dcf49 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 02aa03e3..bf1eff03 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.34.0/llama_cpp_binaries-0.34.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 0e88a621fd96bc75b908d078972ab8117e957f55 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:16:03 -0700 Subject: [PATCH 40/51] UI: Better organize the right sidebar --- modules/ui_chat.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 3b922fb4..94c980bb 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -78,12 +78,19 @@ def create_ui(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) + gr.HTML("
") + shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.') + + gr.HTML("
") + shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search') with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) + gr.HTML("
") + with gr.Row(): shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') @@ -93,6 +100,8 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) + gr.HTML("
") + with gr.Row(): shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm') From 0e3def449a8bf71ab40c052e4206f612aeba0a60 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:17:25 -0700 Subject: [PATCH 41/51] llama.cpp: --swa-full to llama-server when streaming-llm is checked --- modules/llama_cpp_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 63c8eda0..58534f26 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -356,6 +356,7 @@ class LlamaServer: cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)] if shared.args.streaming_llm: cmd += ["--cache-reuse", "1"] + cmd += ["--swa-full"] if shared.args.extra_flags: # Clean up the input extra_flags = shared.args.extra_flags.strip() From c47e6deda279f27c7bff1a31351e72c0d5025052 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:20:20 -0700 Subject: [PATCH 42/51] Update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 907d8c38..6e59f7da 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. - 100% offline and private, with zero telemetry, external resources, or remote update requests. - **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents. +- **Vision (multimodal models)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal%E2Tutorial)). - **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation. - Aesthetic UI with dark and light themes. - Syntax highlighting for code blocks and LaTeX rendering for mathematical expressions. 
From e6447cd24acbde845dbb4aa27acfd4c17b5c849c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 17:42:35 -0700 Subject: [PATCH 43/51] mtmd: Update the llama-server request --- modules/llama_cpp_server.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 58534f26..e82edb90 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -149,11 +149,10 @@ class LlamaServer: IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image base64_images = [convert_pil_to_base64(img) for img in pil_images] - multimodal_prompt_object = { - "prompt": prompt, + payload["prompt"] = { + "prompt_string": prompt, "multimodal_data": base64_images } - payload["prompt"] = multimodal_prompt_object # Calculate an estimated token count text_tokens = self.encode(prompt, add_bos_token=state["add_bos_token"]) From d8fcc71616307a8ecacea93b7bdfa1117a23e1fe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 11 Aug 2025 18:02:33 -0700 Subject: [PATCH 44/51] mtmd: Fail early if images are provided but the model doesn't support them (llama.cpp) --- modules/llama_cpp_server.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index e82edb90..51dacb84 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -34,6 +34,7 @@ class LlamaServer: self.process = None self.session = requests.Session() self.vocabulary_size = None + self.has_multimodal = False self.bos_token = "<s>" self.last_prompt_token_count = 0 @@ -144,6 +145,10 @@ class LlamaServer: elif 'raw_images' in state and state['raw_images']: pil_images.extend(state.get('raw_images', [])) + # Fail early if images are provided but the model doesn't support them + if pil_images and not self.has_multimodal: + raise RuntimeError("The loaded llama.cpp model does not support multimodal requests. 
You must load a vision model and provide an mmproj file.") + if pil_images: # Multimodal case IMAGE_TOKEN_COST_ESTIMATE = 600 # A safe, conservative estimate per image @@ -261,8 +266,8 @@ class LlamaServer: else: raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") - def _get_vocabulary_size(self): - """Get and store the model's maximum context length.""" + def _get_model_properties(self): + """Get and store the model's properties, including vocab size and multimodal capability.""" url = f"http://127.0.0.1:{self.port}/v1/models" response = self.session.get(url).json() @@ -271,6 +276,10 @@ class LlamaServer: if "meta" in model_info and "n_vocab" in model_info["meta"]: self.vocabulary_size = model_info["meta"]["n_vocab"] + # Check for multimodal capability + if "capabilities" in model_info and "multimodal" in model_info["capabilities"]: + self.has_multimodal = True + def _get_bos_token(self): """Get and store the model's BOS token.""" url = f"http://127.0.0.1:{self.port}/props" @@ -421,7 +430,7 @@ class LlamaServer: time.sleep(1) # Server is now healthy, get model info - self._get_vocabulary_size() + self._get_model_properties() self._get_bos_token() return self.port From 0882970a9445badcd953f27e4e10ecf869c103a5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 12 Aug 2025 07:00:24 -0700 Subject: [PATCH 45/51] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 789539fc..eb7742b1 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index d7922478..47bcb60a 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 2a3337a3..6958ce37 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 7287497d..0890b2a5 100644 --- 
a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 48ebe381..da3010c6 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 
ccf80d06..3a9a953b 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index e819dd04..a3e176d3 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 8b9c882c..807d0a21 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index ce81c5ff..41e96574 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 6233b84a..72ba7103 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index e3a863ec..0c7f1d29 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 26f813d2..09f1c502 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 4de1159d..75296cb4 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index fded9898..ff3d7cb1 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 013364ff..97414bde 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 85e95eb3..7f543205 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 945dcf49..c1764ead 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index bf1eff03..142b67ec 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.35.0/llama_cpp_binaries-0.35.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.36.0/llama_cpp_binaries-0.36.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
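A note on the requirement lines above: each pairs a direct wheel URL with a PEP 508 environment marker after the semicolon, and pip installs only the lines whose marker matches the running platform. A minimal sketch of that selection step, using the third-party `packaging` library (an assumption; pip ships its own vendored marker evaluator):

```python
# Sketch: how an installer picks one wheel from marker-qualified lines.
# Requires the `packaging` library (pip install packaging).
from packaging.markers import Marker

candidates = [
    ("llama_cpp_binaries-0.36.0+cu124-py3-none-win_amd64.whl",
     'platform_system == "Windows"'),
    ("llama_cpp_binaries-0.36.0+cu124-py3-none-linux_x86_64.whl",
     'platform_system == "Linux" and platform_machine == "x86_64"'),
]

for wheel, marker in candidates:
    # Marker.evaluate() checks the expression against the current environment
    if Marker(marker).evaluate():
        print(f"selected: {wheel}")
```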
From 2238302b496a4145ee98e0eab0bf3d9f19a9c83b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 08:50:45 -0700
Subject: [PATCH 46/51] ExLlamaV3: Add speculative decoding

---
 modules/exllamav3.py | 58 ++++++++++++++++++++++++++++++++++++++++++++
 modules/loaders.py   |  4 +++
 2 files changed, 62 insertions(+)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 980230f8..7fc6c5b1 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -85,6 +85,7 @@ class Exllamav3Model:
         cache = Cache(model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs)

         load_params = {'progressbar': True}
+        split = None
         if shared.args.gpu_split:
             split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
             load_params['use_per_device'] = split
@@ -92,6 +93,45 @@ class Exllamav3Model:
         model.load(**load_params)
         tokenizer = Tokenizer.from_config(config)

+        # Initialize draft model for speculative decoding
+        draft_model = None
+        draft_cache = None
+        if shared.args.model_draft and shared.args.model_draft.lower() not in ["", "none"]:
+            logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}")
+
+            draft_path = Path(shared.args.model_draft)
+            if not draft_path.is_dir():
+                draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft)
+
+            if not draft_path.is_dir():
+                logger.warning(f"Draft model not found at {draft_path}, speculative decoding disabled.")
+            else:
+                draft_config = Config.from_directory(str(draft_path))
+
+                # Set context size for draft model with 256-multiple validation
+                if shared.args.ctx_size_draft > 0:
+                    draft_max_tokens = shared.args.ctx_size_draft
+                else:
+                    draft_max_tokens = shared.args.ctx_size
+
+                # Validate draft model context size is a multiple of 256
+                if draft_max_tokens % 256 != 0:
+                    adjusted_draft_tokens = ((draft_max_tokens // 256) + 1) * 256
+                    logger.warning(f"Draft model max_num_tokens must be a multiple of 256. Adjusting from {draft_max_tokens} to {adjusted_draft_tokens}")
+                    draft_max_tokens = adjusted_draft_tokens
+
+                draft_config.max_seq_len = draft_max_tokens
+
+                draft_model = Model.from_config(draft_config)
+                draft_cache = Cache(draft_model, max_num_tokens=draft_max_tokens, layer_type=layer_type, **cache_kwargs)
+
+                draft_load_params = {'progressbar': True}
+                if split:
+                    draft_load_params['use_per_device'] = split
+
+                draft_model.load(**draft_load_params)
+                logger.info(f"Draft model loaded successfully. Max speculative tokens: {shared.args.draft_max}")
+
         # Load vision model component (ExLlamaV3 native)
         vision_model = None
         if "vision_config" in config.config_dict:
@@ -109,6 +149,9 @@ class Exllamav3Model:
             model=model,
             cache=cache,
             tokenizer=tokenizer,
+            draft_model=draft_model,
+            draft_cache=draft_cache,
+            num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0,
         )

         result = cls()
@@ -119,6 +162,8 @@ class Exllamav3Model:
         result.config = config
         result.max_tokens = max_tokens
         result.vision_model = vision_model
+        result.draft_model = draft_model
+        result.draft_cache = draft_cache

         return result

@@ -289,6 +334,7 @@ class Exllamav3Model:
         self.generator.enqueue(job)

         response_text = ""
+
         try:
             while self.generator.num_remaining_jobs():
                 results = self.generator.iterate()
@@ -300,6 +346,7 @@ class Exllamav3Model:
                 if chunk:
                     response_text += chunk
                     yield response_text
+
         finally:
             self.generator.clear_queue()

@@ -331,6 +378,17 @@ class Exllamav3Model:
                 logger.warning(f"Error unloading vision model: {e}")
             self.vision_model = None

+        if hasattr(self, 'draft_model') and self.draft_model is not None:
+            try:
+                self.draft_model.unload()
+                del self.draft_model
+            except Exception as e:
+                logger.warning(f"Error unloading draft model: {e}")
+            self.draft_model = None
+
+        if hasattr(self, 'draft_cache') and self.draft_cache is not None:
+            self.draft_cache = None
+
         if hasattr(self, 'model') and self.model is not None:
             try:
                 self.model.unload()
diff --git a/modules/loaders.py b/modules/loaders.py
index feca9985..8b7e6cce 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -61,6 +61,10 @@ loaders_and_params = OrderedDict({
         'ctx_size',
         'cache_type',
         'gpu_split',
+        'model_draft',
+        'draft_max',
+        'ctx_size_draft',
+        'speculative_decoding_accordion',
     ],
     'ExLlamav2_HF': [
         'ctx_size',
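The context-size adjustment in the patch above is easy to miss: the draft cache size must be a multiple of 256 (presumably because the cache is allocated in fixed-size pages), so off values are rounded up rather than rejected. A standalone sketch of the same arithmetic, with a helper name of our own choosing:

```python
def round_up_to_multiple(n_tokens: int, multiple: int = 256) -> int:
    """Round a context size up to the next multiple of `multiple`,
    mirroring the adjustment applied to draft_max_tokens above."""
    if n_tokens % multiple == 0:
        return n_tokens
    return ((n_tokens // multiple) + 1) * multiple


# 5000 is not a multiple of 256, so it gets bumped to 5120
assert round_up_to_multiple(5000) == 5120
# exact multiples pass through unchanged
assert round_up_to_multiple(8192) == 8192
```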
From 2f6a629393afdb33e7fd355be10f6c72185412af Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 08:51:01 -0700
Subject: [PATCH 47/51] UI: Minor improvement after 0e88a621fd96bc75b908d078972ab8117e957f55

---
 js/main.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/js/main.js b/js/main.js
index 66a344b3..4b4b14c2 100644
--- a/js/main.js
+++ b/js/main.js
@@ -583,7 +583,7 @@ function moveToChatTab() {
   const chatControlsFirstChild = document.querySelector("#chat-controls").firstElementChild;

   const newParent = chatControlsFirstChild;
-  let newPosition = newParent.children.length - 2;
+  let newPosition = newParent.children.length - 3;

   newParent.insertBefore(grandParent, newParent.children[newPosition]);
   document.getElementById("save-character").style.display = "none";

From 8d7b88106a34102863a491a9c8848871c5118a85 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:20:16 -0700
Subject: [PATCH 48/51] Revert "mtmd: Fail early if images are provided but the model doesn't support them (llama.cpp)"

This reverts commit d8fcc71616307a8ecacea93b7bdfa1117a23e1fe.
---
 modules/llama_cpp_server.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 51dacb84..e82edb90 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -34,7 +34,6 @@ class LlamaServer:
         self.process = None
         self.session = requests.Session()
         self.vocabulary_size = None
-        self.has_multimodal = False
         self.bos_token = ""
         self.last_prompt_token_count = 0
@@ -145,10 +144,6 @@ class LlamaServer:
         elif 'raw_images' in state and state['raw_images']:
             pil_images.extend(state.get('raw_images', []))

-        # Fail early if images are provided but the model doesn't support them
-        if pil_images and not self.has_multimodal:
-            raise RuntimeError("The loaded llama.cpp model does not support multimodal requests. You must load a vision model and provide an mmproj file.")
-
         if pil_images:
             # Multimodal case
             IMAGE_TOKEN_COST_ESTIMATE = 600  # A safe, conservative estimate per image
@@ -266,8 +261,8 @@ class LlamaServer:
         else:
             raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")

-    def _get_model_properties(self):
-        """Get and store the model's properties, including vocab size and multimodal capability."""
+    def _get_vocabulary_size(self):
+        """Get and store the model's vocabulary size."""
         url = f"http://127.0.0.1:{self.port}/v1/models"
         response = self.session.get(url).json()
@@ -276,10 +271,6 @@ class LlamaServer:
         if "meta" in model_info and "n_vocab" in model_info["meta"]:
             self.vocabulary_size = model_info["meta"]["n_vocab"]

-        # Check for multimodal capability
-        if "capabilities" in model_info and "multimodal" in model_info["capabilities"]:
-            self.has_multimodal = True
-
     def _get_bos_token(self):
         """Get and store the model's BOS token."""
         url = f"http://127.0.0.1:{self.port}/props"
@@ -430,7 +421,7 @@ class LlamaServer:
             time.sleep(1)

         # Server is now healthy, get model info
-        self._get_model_properties()
+        self._get_vocabulary_size()
         self._get_bos_token()

         return self.port
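After the revert, `_get_vocabulary_size()` is once again the only probe against the server's `/v1/models` endpoint. For orientation, the same lookup can be reproduced from outside the process; a sketch under two assumptions — the port (LlamaServer picks one at runtime) and the OpenAI-style `data[0]` response shape:

```python
import requests

PORT = 8080  # assumption: LlamaServer chooses its port dynamically

# Same request _get_vocabulary_size() issues once the server turns healthy
response = requests.get(f"http://127.0.0.1:{PORT}/v1/models").json()

# Assumed response shape: {"data": [{"meta": {"n_vocab": ...}, ...}]}
model_info = response["data"][0] if "data" in response else response
meta = model_info.get("meta", {})
if "n_vocab" in meta:
    print("vocabulary size:", meta["n_vocab"])
else:
    print("n_vocab not reported by this server build")
```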
From 7301452b4183efab97de71dae27486874a3d73f6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:23:24 -0700
Subject: [PATCH 49/51] UI: Minor info message change

---
 modules/ui_model_menu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 9fa8a4f4..6972a17e 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -42,7 +42,7 @@ def create_ui():
         with gr.Row():
             with gr.Column():
                 shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
-                shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.')
+                shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
                 shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                 shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
                 shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
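The tutorial introduced in the next patch ends by linking to ready-made API examples. As a small companion, here is a sketch that sends a local file through the OpenAI-compatible endpoint as a base64 data URL; the data-URL form follows the OpenAI `image_url` convention, and whether this build accepts it in addition to plain https:// URLs is an assumption:

```python
import base64

import requests

URL = "http://127.0.0.1:5000/v1/chat/completions"

# Encode a local file as a data URL, per the OpenAI image_url convention
with open("cat.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

payload = {
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }]
}

response = requests.post(URL, json=payload)
print(response.json()["choices"][0]["message"]["content"])
```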
From 2f979ce2942efc82ad90dfc28c7407c473da5169 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:33:49 -0700
Subject: [PATCH 50/51] docs: Add a multimodal tutorial

---
 docs/Multimodal Tutorial.md | 66 +++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 docs/Multimodal Tutorial.md

diff --git a/docs/Multimodal Tutorial.md b/docs/Multimodal Tutorial.md
new file mode 100644
index 00000000..a30889f7
--- /dev/null
+++ b/docs/Multimodal Tutorial.md
@@ -0,0 +1,66 @@
+## Getting started
+
+### 1. Find a multimodal model
+
+GGUF models with vision capabilities are uploaded to Hugging Face alongside an `mmproj` file.
+
+For instance, [unsloth/gemma-3-4b-it-GGUF](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/tree/main) has this:
+
+[screenshot: print1]
+
+### 2. Download the model to `user_data/models`
+
+As an example, download
+
+https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/gemma-3-4b-it-Q4_K_S.gguf?download=true
+
+to your `text-generation-webui/user_data/models` folder.
+
+### 3. Download the associated mmproj file to `user_data/mmproj`
+
+Then download
+
+https://huggingface.co/unsloth/gemma-3-4b-it-GGUF/resolve/main/mmproj-F16.gguf?download=true
+
+to your `text-generation-webui/user_data/mmproj` folder. Rename it to `mmproj-gemma-3-4b-it-F16.gguf` so the file is easy to identify later.
+
+### 4. Load the model
+
+1. Launch the web UI
+2. Navigate to the Model tab
+3. Select the GGUF model in the Model dropdown:
+
+[screenshot: print2]
+
+4. Select the mmproj file in the Multimodal (vision) menu:
+
+[screenshot: print3]
+
+5. Click "Load"
+
+### 5. Send a message with an image
+
+Select your image by clicking on the 📎 icon and send your message:
+
+[screenshot: print5]
+
+The model will reply with a detailed understanding of the image contents:
+
+[screenshot: print6]
+
+## Multimodal with ExLlamaV3
+
+Multimodal also works with the ExLlamaV3 loader (the non-HF one).
+
+No additional files are necessary; just load a multimodal EXL3 model and send an image.
+
+Examples of models that you can use:
+
+- https://huggingface.co/turboderp/gemma-3-27b-it-exl3
+- https://huggingface.co/turboderp/Mistral-Small-3.1-24B-Instruct-2503-exl3
+
+## Multimodal API examples
+
+On the page below you can find ready-to-use examples:
+
+[Multimodal/vision (llama.cpp and ExLlamaV3)](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#multimodalvision-llamacpp-and-exllamav3)

From 41b95e9ec3dada8a931abb1a1ca974529d12d177 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:37:37 -0700
Subject: [PATCH 51/51] Lint

---
 modules/exllamav2.py | 2 +-
 modules/exllamav3.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 3b3233d2..5d5c5b56 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -3,7 +3,6 @@ import traceback
 from pathlib import Path

 import torch
-
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Cache,
@@ -16,6 +15,7 @@ from exllamav2 import (
     ExLlamaV2Tokenizer
 )
 from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
+
 from modules import shared
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 7fc6c5b1..66e25693 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -16,6 +16,7 @@ from exllamav3.generator.sampler import (
     SS_TopK,
     SS_TopP
 )
+
 from modules import shared
 from modules.image_utils import (
     convert_image_attachments_to_pil,