diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 27defe42..51427050 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -417,7 +417,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p logprob_proc.token_alternatives_history.clear() chat_logprobs_offset = [0] # mutable for closure access in streaming - def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False): + def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False, reasoning_content=None): # begin streaming delta = {} if include_role: @@ -425,6 +425,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p delta['refusal'] = None if content is not None: delta['content'] = content + if reasoning_content is not None: + delta['reasoning_content'] = reasoning_content if chunk_tool_calls: delta['tool_calls'] = chunk_tool_calls @@ -477,6 +479,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p answer = '' seen_content = '' + seen_reasoning = '' tool_calls = [] end_last_tool_call = 0 @@ -508,17 +511,31 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p break if stream: - len_seen = len(seen_content) - new_content = answer[len_seen:] + # Strip reasoning/thinking blocks so only final content is streamed. + # Reasoning is emitted separately as reasoning_content deltas. + reasoning, content = extract_reasoning(answer) + if reasoning is not None: + new_reasoning = reasoning[len(seen_reasoning):] + new_content = content[len(seen_content):] + else: + new_reasoning = None + new_content = answer[len(seen_content):] - if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. + if (not new_content and not new_reasoning) or chr(0xfffd) in (new_content or '') + (new_reasoning or ''): continue - chunk = chat_streaming_chunk(new_content) + chunk = chat_streaming_chunk( + content=new_content if new_content else None, + reasoning_content=new_reasoning if new_reasoning else None, + ) if include_usage: chunk['usage'] = None - seen_content = answer + if reasoning is not None: + seen_reasoning = reasoning + seen_content = content + else: + seen_content = answer yield chunk token_count = shared.model.last_prompt_token_count if hasattr(shared.model, 'last_prompt_token_count') else 0 diff --git a/modules/reasoning.py b/modules/reasoning.py index 708ee55a..3a9ab546 100644 --- a/modules/reasoning.py +++ b/modules/reasoning.py @@ -8,7 +8,7 @@ THINKING_FORMATS = [ ('<|channel|>commentary<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'), ('', '', None), ('<|think|>', '<|end|>', '<|content|>'), # Solar Open - ('Thinking Process:', '', None), # Qwen3.5 verbose thinking outside tags + # ('Thinking Process:', '', None), # Qwen3.5 verbose thinking outside tags -- removed: too prone to false positives in streaming (None, '', None), # End-only variant (e.g., Qwen3-next) ] @@ -42,6 +42,12 @@ def extract_reasoning(text, html_escaped=False): start_esc = esc(start_tag) start_pos = text.find(start_esc) if start_pos == -1: + # During streaming, the start tag may be arriving partially. + # If the text is a prefix of a start tag, return empty content + # to prevent the partial tag from leaking. + stripped = text.strip() + if stripped and start_esc.startswith(stripped): + return '', '' continue thought_start = start_pos + len(start_esc) end_pos = text.find(end_esc, thought_start) @@ -63,7 +69,13 @@ def extract_reasoning(text, html_escaped=False): thought_end = end_pos if content_esc: content_pos = text.find(content_esc, end_pos) - content_start = content_pos + len(content_esc) if content_pos != -1 else end_pos + len(end_esc) + if content_pos != -1: + content_start = content_pos + len(content_esc) + else: + # Content tag expected but not yet present (e.g. partial + # streaming) — suppress intermediate tags between end_tag + # and content_tag so they don't leak as content. + content_start = len(text) else: content_start = end_pos + len(end_esc)