Set multimodal status during Model Loading (#7199)
parent 725a8bcf60
commit 57f6e9af5a
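In summary, the commit moves the multimodal check from generation time to load time: load_model() records the capability of the freshly loaded model in shared.is_multimodal, and the ExLlamaV3 and llama.cpp backends consult that flag before doing any image processing. A minimal sketch of that life cycle follows (everything except the is_multimodal / shared.is_multimodal names is an illustrative stand-in, not code from this commit):

# Illustrative sketch only; "shared" and the model wrapper are hypothetical stand-ins.
class shared:
    is_multimodal = False          # module-level flag set once per loaded model

class FakeModel:                   # hypothetical backend wrapper
    def is_multimodal(self):
        return True                # e.g. a multimodal projector was supplied
    def _process_images_for_generation(self, prompt, state):
        return prompt, ['<image embedding>']

def load_model_sketch(model, loader):
    # Load time: record the capability once for the loaders that can report it.
    shared.is_multimodal = loader.lower() in ('exllamav3', 'llama.cpp') and model.is_multimodal()
    return model

def generate_sketch(model, prompt, state):
    # Generation time: branch on the cached flag; text-only models skip image work.
    image_embeddings = []
    if shared.is_multimodal:
        prompt, image_embeddings = model._process_images_for_generation(prompt, state)
    return prompt, image_embeddings

model = load_model_sketch(FakeModel(), 'llama.cpp')
print(generate_sketch(model, 'Describe the image.', state={}))
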
@@ -177,9 +177,6 @@ class Exllamav3Model:
         Process all possible image inputs and return modified prompt + embeddings.
         Returns: (processed_prompt, image_embeddings)
         """
-        if not self.is_multimodal():
-            return prompt, []
-
         # Collect images from various sources using shared utilities
         pil_images = []
 
@@ -234,8 +231,11 @@ class Exllamav3Model:
         """
         Generate text with streaming using native ExLlamaV3 API
         """
-        # Process images and modify prompt (ExLlamaV3-specific)
-        prompt, image_embeddings = self._process_images_for_generation(prompt, state)
+        image_embeddings = []
+
+        if shared.is_multimodal:
+            # Process images and modify prompt (ExLlamaV3-specific)
+            prompt, image_embeddings = self._process_images_for_generation(prompt, state)
 
         # Greedy decoding is a special case
         if state['temperature'] == 0:

@@ -8,6 +8,7 @@ import sys
 import threading
 import time
 from pathlib import Path
+from typing import Any, List
 
 import llama_cpp_binaries
 import requests

@@ -129,10 +130,10 @@ class LlamaServer:
 
         return payload
 
-    def generate_with_streaming(self, prompt, state):
-        url = f"http://127.0.0.1:{self.port}/completion"
-        payload = self.prepare_payload(state)
-
+    def _process_images_for_generation(self, state: dict) -> List[Any]:
+        """
+        Process all possible image inputs and return PIL images
+        """
         pil_images = []
         # Source 1: Web UI (from chatbot_wrapper)
         if 'image_attachments' in state and state['image_attachments']:

@@ -144,6 +145,21 @@ class LlamaServer:
         elif 'raw_images' in state and state['raw_images']:
             pil_images.extend(state.get('raw_images', []))
 
+        return pil_images
+
+    def is_multimodal(self) -> bool:
+        """Check if this model supports multimodal input."""
+        return shared.args.mmproj not in [None, 'None']
+
+    def generate_with_streaming(self, prompt, state):
+        url = f"http://127.0.0.1:{self.port}/completion"
+        payload = self.prepare_payload(state)
+
+        pil_images = []
+
+        if shared.is_multimodal:
+            pil_images = self._process_images_for_generation(state)
+
         if pil_images:
             # Multimodal case
             IMAGE_TOKEN_COST_ESTIMATE = 600  # A safe, conservative estimate per image

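For the llama.cpp loader, multimodal support is inferred from whether a multimodal projector (mmproj) file was supplied, with both None and the string 'None' treated as "no projector". A reduced, self-contained sketch of that predicate (args stands in for the parsed command-line options; the file name is made up):

from types import SimpleNamespace

def is_multimodal(args) -> bool:
    # Same shape as the mmproj check above: anything other than None / 'None'
    # is taken to mean a projector file was provided.
    return args.mmproj not in [None, 'None']

print(is_multimodal(SimpleNamespace(mmproj=None)))                  # False
print(is_multimodal(SimpleNamespace(mmproj='None')))                # False
print(is_multimodal(SimpleNamespace(mmproj='vision.mmproj.gguf')))  # True (hypothetical path)
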
@@ -55,6 +55,10 @@ def load_model(model_name, loader=None):
     if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
         shared.settings['truncation_length'] = shared.args.ctx_size
 
+    shared.is_multimodal = False
+    if loader.lower() in ('exllamav3', 'llama.cpp'):
+        shared.is_multimodal = model.is_multimodal()
+
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
     logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")

@@ -16,6 +16,7 @@ model = None
 tokenizer = None
 model_name = 'None'
 is_seq2seq = False
+is_multimodal = False
 model_dirty_from_training = False
 lora_names = []
 
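Because the status now lives in shared module state, code elsewhere can branch on it without holding a reference to the model object. A hypothetical consumer, not part of this commit, might look like:

from types import SimpleNamespace

def build_generation_state(user_text, attachments, shared_state):
    # Only forward image attachments when the loaded model can actually use them.
    state = {'prompt': user_text}
    if shared_state.is_multimodal and attachments:
        state['image_attachments'] = attachments
    return state

print(build_generation_state('hi', ['cat.png'], SimpleNamespace(is_multimodal=False)))
print(build_generation_state('hi', ['cat.png'], SimpleNamespace(is_multimodal=True)))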