From 8d7b88106a34102863a491a9c8848871c5118a85 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:20:16 -0700
Subject: [PATCH] Revert "mtmd: Fail early if images are provided but the model
 doesn't support them (llama.cpp)"

This reverts commit d8fcc71616307a8ecacea93b7bdfa1117a23e1fe.
---
 modules/llama_cpp_server.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 51dacb84..e82edb90 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -34,7 +34,6 @@ class LlamaServer:
         self.process = None
         self.session = requests.Session()
         self.vocabulary_size = None
-        self.has_multimodal = False
         self.bos_token = "<s>"
         self.last_prompt_token_count = 0
 
@@ -145,10 +144,6 @@ class LlamaServer:
         elif 'raw_images' in state and state['raw_images']:
             pil_images.extend(state.get('raw_images', []))
 
-        # Fail early if images are provided but the model doesn't support them
-        if pil_images and not self.has_multimodal:
-            raise RuntimeError("The loaded llama.cpp model does not support multimodal requests. You must load a vision model and provide an mmproj file.")
-
         if pil_images:
             # Multimodal case
             IMAGE_TOKEN_COST_ESTIMATE = 600  # A safe, conservative estimate per image
@@ -266,8 +261,8 @@ class LlamaServer:
         else:
             raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
 
-    def _get_model_properties(self):
-        """Get and store the model's properties, including vocab size and multimodal capability."""
+    def _get_vocabulary_size(self):
+        """Get and store the model's vocabulary size."""
         url = f"http://127.0.0.1:{self.port}/v1/models"
         response = self.session.get(url).json()
 
@@ -276,10 +271,6 @@ class LlamaServer:
             if "meta" in model_info and "n_vocab" in model_info["meta"]:
                 self.vocabulary_size = model_info["meta"]["n_vocab"]
 
-            # Check for multimodal capability
-            if "capabilities" in model_info and "multimodal" in model_info["capabilities"]:
-                self.has_multimodal = True
-
     def _get_bos_token(self):
         """Get and store the model's BOS token."""
         url = f"http://127.0.0.1:{self.port}/props"
@@ -430,7 +421,7 @@ class LlamaServer:
             time.sleep(1)
 
         # Server is now healthy, get model info
-        self._get_model_properties()
+        self._get_vocabulary_size()
         self._get_bos_token()
         return self.port