diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index e82edb90..51dacb84 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -34,6 +34,7 @@ class LlamaServer:
         self.process = None
         self.session = requests.Session()
         self.vocabulary_size = None
+        self.has_multimodal = False
         self.bos_token = ""
         self.last_prompt_token_count = 0
 
@@ -144,6 +145,10 @@ class LlamaServer:
         elif 'raw_images' in state and state['raw_images']:
             pil_images.extend(state.get('raw_images', []))
 
+        # Fail early if images are provided but the model doesn't support them
+        if pil_images and not self.has_multimodal:
+            raise RuntimeError("The loaded llama.cpp model does not support multimodal requests. You must load a vision model and provide an mmproj file.")
+
         if pil_images:
             # Multimodal case
             IMAGE_TOKEN_COST_ESTIMATE = 600  # A safe, conservative estimate per image
@@ -261,8 +266,8 @@ class LlamaServer:
         else:
             raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
 
-    def _get_vocabulary_size(self):
-        """Get and store the model's maximum context length."""
+    def _get_model_properties(self):
+        """Get and store the model's properties, including vocab size and multimodal capability."""
         url = f"http://127.0.0.1:{self.port}/v1/models"
         response = self.session.get(url).json()
 
@@ -271,6 +276,10 @@ class LlamaServer:
         if "meta" in model_info and "n_vocab" in model_info["meta"]:
             self.vocabulary_size = model_info["meta"]["n_vocab"]
 
+        # Check for multimodal capability
+        if "capabilities" in model_info and "multimodal" in model_info["capabilities"]:
+            self.has_multimodal = True
+
     def _get_bos_token(self):
         """Get and store the model's BOS token."""
         url = f"http://127.0.0.1:{self.port}/props"
@@ -421,7 +430,7 @@ class LlamaServer:
             time.sleep(1)
 
         # Server is now healthy, get model info
-        self._get_vocabulary_size()
+        self._get_model_properties()
         self._get_bos_token()
 
         return self.port
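
Note: the new capability check relies on the shape of the server's /v1/models response. A minimal sketch of the model entry that _get_model_properties assumes, with field names taken from the checks in the patch and purely hypothetical example values:

    # Hypothetical /v1/models entry; only "meta.n_vocab" and "capabilities"
    # are actually inspected by _get_model_properties.
    model_info = {
        "id": "example-vision-model.gguf",
        "meta": {"n_vocab": 128256},                   # stored as self.vocabulary_size
        "capabilities": ["completion", "multimodal"],  # "multimodal" sets self.has_multimodal
    }

    has_multimodal = "capabilities" in model_info and "multimodal" in model_info["capabilities"]

If the loaded model was started without an mmproj file, the "multimodal" entry is expected to be absent, so has_multimodal stays False and any request that includes images fails early with the RuntimeError added above instead of reaching the server.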