diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 6f4dfc29..c3037d0c 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -85,16 +85,28 @@ def process_parameters(body, is_legacy=False):
 
 
 def process_multimodal_content(content):
-    """Extract text from OpenAI multimodal format for non-multimodal models"""
+    """Extract text and add image placeholders from OpenAI multimodal format"""
     if isinstance(content, str):
         return content
 
     if isinstance(content, list):
         text_parts = []
+        image_placeholders = ""
         for item in content:
-            if isinstance(item, dict) and item.get('type') == 'text':
+            if not isinstance(item, dict):
+                continue
+
+            item_type = item.get('type', '')
+            if item_type == 'text':
                 text_parts.append(item.get('text', ''))
-        return ' '.join(text_parts) if text_parts else str(content)
+            elif item_type == 'image_url':
+                image_placeholders += "<__media__>"
+
+        final_text = ' '.join(text_parts)
+        if image_placeholders:
+            return f"{image_placeholders}\n\n{final_text}"
+        else:
+            return final_text
 
     return str(content)
 
diff --git a/modules/chat.py b/modules/chat.py
index 42bb58a5..7b1629dd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -870,18 +870,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
         row_idx = len(output['internal']) - 1
 
-        # Collect image attachments for multimodal generation
-        image_attachments = []
+        # Collect image attachments for multimodal generation from the entire history
+        all_image_attachments = []
         if 'metadata' in output:
-            user_key = f"user_{row_idx}"
-            if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
-                for attachment in output['metadata'][user_key]["attachments"]:
-                    if attachment.get("type") == "image":
-                        image_attachments.append(attachment)
+            for i in range(len(output['internal'])):
+                user_key = f"user_{i}"
+                if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
+                    for attachment in output['metadata'][user_key]["attachments"]:
+                        if attachment.get("type") == "image":
+                            all_image_attachments.append(attachment)
 
-        # Add image attachments to state for the generation
-        if image_attachments:
-            state['image_attachments'] = image_attachments
+        # Add all collected image attachments to state for the generation
+        if all_image_attachments:
+            state['image_attachments'] = all_image_attachments
 
         # Generate the prompt
         kwargs = {
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 3e8127ab..63c8eda0 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -15,6 +15,7 @@ import requests
 from modules import shared
 from modules.image_utils import (
     convert_image_attachments_to_pil,
+    convert_openai_messages_to_images,
     convert_pil_to_base64
 )
 from modules.logging_colors import logger
@@ -133,10 +134,13 @@ class LlamaServer:
         payload = self.prepare_payload(state)
 
         pil_images = []
-        # Check for images from the Web UI (image_attachments)
+        # Source 1: Web UI (from chatbot_wrapper)
         if 'image_attachments' in state and state['image_attachments']:
             pil_images.extend(convert_image_attachments_to_pil(state['image_attachments']))
-        # Else, check for images from the API (raw_images)
+        # Source 2: Chat Completions API (/v1/chat/completions)
+        elif 'history' in state and state.get('history', {}).get('messages'):
+            pil_images.extend(convert_openai_messages_to_images(state['history']['messages']))
+        # Source 3: Legacy Completions API (/v1/completions)
         elif 'raw_images' in state and state['raw_images']:
             pil_images.extend(state.get('raw_images', []))
 