API: Stream reasoning_content separately from content in OpenAI-compatible responses

This commit is contained in:
oobabooga 2026-03-14 06:52:40 -07:00
parent accb2ef661
commit 09a6549816
2 changed files with 37 additions and 8 deletions

View file

@ -417,7 +417,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
logprob_proc.token_alternatives_history.clear()
chat_logprobs_offset = [0] # mutable for closure access in streaming
def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False):
def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False, reasoning_content=None):
# begin streaming
delta = {}
if include_role:
@ -425,6 +425,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
delta['refusal'] = None
if content is not None:
delta['content'] = content
if reasoning_content is not None:
delta['reasoning_content'] = reasoning_content
if chunk_tool_calls:
delta['tool_calls'] = chunk_tool_calls
@ -477,6 +479,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
answer = ''
seen_content = ''
seen_reasoning = ''
tool_calls = []
end_last_tool_call = 0
@ -508,17 +511,31 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
break
if stream:
len_seen = len(seen_content)
new_content = answer[len_seen:]
# Strip reasoning/thinking blocks so only final content is streamed.
# Reasoning is emitted separately as reasoning_content deltas.
reasoning, content = extract_reasoning(answer)
if reasoning is not None:
new_reasoning = reasoning[len(seen_reasoning):]
new_content = content[len(seen_content):]
else:
new_reasoning = None
new_content = answer[len(seen_content):]
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
if (not new_content and not new_reasoning) or chr(0xfffd) in (new_content or '') + (new_reasoning or ''):
continue
chunk = chat_streaming_chunk(new_content)
chunk = chat_streaming_chunk(
content=new_content if new_content else None,
reasoning_content=new_reasoning if new_reasoning else None,
)
if include_usage:
chunk['usage'] = None
seen_content = answer
if reasoning is not None:
seen_reasoning = reasoning
seen_content = content
else:
seen_content = answer
yield chunk
token_count = shared.model.last_prompt_token_count if hasattr(shared.model, 'last_prompt_token_count') else 0

View file

@ -8,7 +8,7 @@ THINKING_FORMATS = [
('<|channel|>commentary<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
('<seed:think>', '</seed:think>', None),
('<|think|>', '<|end|>', '<|content|>'), # Solar Open
('Thinking Process:', '</think>', None), # Qwen3.5 verbose thinking outside tags
# ('Thinking Process:', '</think>', None), # Qwen3.5 verbose thinking outside tags -- removed: too prone to false positives in streaming
(None, '</think>', None), # End-only variant (e.g., Qwen3-next)
]
@ -42,6 +42,12 @@ def extract_reasoning(text, html_escaped=False):
start_esc = esc(start_tag)
start_pos = text.find(start_esc)
if start_pos == -1:
# During streaming, the start tag may have arrived only partially.
# If the entire text so far is a prefix of this start tag, return
# empty reasoning and empty content so the partial tag does not
# leak into the streamed output.
stripped = text.strip()
if stripped and start_esc.startswith(stripped):
return '', ''
continue
thought_start = start_pos + len(start_esc)
end_pos = text.find(end_esc, thought_start)
@ -63,7 +69,13 @@ def extract_reasoning(text, html_escaped=False):
thought_end = end_pos
if content_esc:
content_pos = text.find(content_esc, end_pos)
content_start = content_pos + len(content_esc) if content_pos != -1 else end_pos + len(end_esc)
if content_pos != -1:
content_start = content_pos + len(content_esc)
else:
# Content tag expected but not yet present (e.g. partial
# streaming) — suppress intermediate tags between end_tag
# and content_tag so they don't leak as content.
content_start = len(text)
else:
content_start = end_pos + len(end_esc)