API: Stream reasoning_content separately from content in OpenAI-compatible responses

This commit is contained in:
oobabooga 2026-03-14 06:52:40 -07:00
parent accb2ef661
commit 09a6549816
2 changed files with 37 additions and 8 deletions

View file

@ -417,7 +417,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
logprob_proc.token_alternatives_history.clear()
chat_logprobs_offset = [0] # mutable for closure access in streaming
def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False):
def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False, reasoning_content=None):
# begin streaming
delta = {}
if include_role:
@ -425,6 +425,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
delta['refusal'] = None
if content is not None:
delta['content'] = content
if reasoning_content is not None:
delta['reasoning_content'] = reasoning_content
if chunk_tool_calls:
delta['tool_calls'] = chunk_tool_calls
@ -477,6 +479,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
answer = ''
seen_content = ''
seen_reasoning = ''
tool_calls = []
end_last_tool_call = 0
@ -508,17 +511,31 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
break
if stream:
len_seen = len(seen_content)
new_content = answer[len_seen:]
# Strip reasoning/thinking blocks so only final content is streamed.
# Reasoning is emitted separately as reasoning_content deltas.
reasoning, content = extract_reasoning(answer)
if reasoning is not None:
new_reasoning = reasoning[len(seen_reasoning):]
new_content = content[len(seen_content):]
else:
new_reasoning = None
new_content = answer[len(seen_content):]
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
if (not new_content and not new_reasoning) or chr(0xfffd) in (new_content or '') + (new_reasoning or ''):
continue
chunk = chat_streaming_chunk(new_content)
chunk = chat_streaming_chunk(
content=new_content if new_content else None,
reasoning_content=new_reasoning if new_reasoning else None,
)
if include_usage:
chunk['usage'] = None
seen_content = answer
if reasoning is not None:
seen_reasoning = reasoning
seen_content = content
else:
seen_content = answer
yield chunk
token_count = shared.model.last_prompt_token_count if hasattr(shared.model, 'last_prompt_token_count') else 0

View file

@ -8,7 +8,7 @@ THINKING_FORMATS = [
('<|channel|>commentary<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
('<seed:think>', '</seed:think>', None),
('<|think|>', '<|end|>', '<|content|>'), # Solar Open
('Thinking Process:', '</think>', None), # Qwen3.5 verbose thinking outside tags
# ('Thinking Process:', '</think>', None), # Qwen3.5 verbose thinking outside tags -- removed: too prone to false positives in streaming
(None, '</think>', None), # End-only variant (e.g., Qwen3-next)
]
@ -42,6 +42,12 @@ def extract_reasoning(text, html_escaped=False):
start_esc = esc(start_tag)
start_pos = text.find(start_esc)
if start_pos == -1:
# During streaming, the start tag may have arrived only partially.
# If the entire text so far is a prefix of this start tag, return
# empty reasoning and empty content so the partial tag does not
# leak into the streamed output.
stripped = text.strip()
if stripped and start_esc.startswith(stripped):
return '', ''
continue
thought_start = start_pos + len(start_esc)
end_pos = text.find(end_esc, thought_start)
@ -63,7 +69,13 @@ def extract_reasoning(text, html_escaped=False):
thought_end = end_pos
if content_esc:
content_pos = text.find(content_esc, end_pos)
content_start = content_pos + len(content_esc) if content_pos != -1 else end_pos + len(end_esc)
if content_pos != -1:
content_start = content_pos + len(content_esc)
else:
# Content tag expected but not yet present (e.g. partial
# streaming) — suppress intermediate tags between end_tag
# and content_tag so they don't leak as content.
content_start = len(text)
else:
content_start = end_pos + len(end_esc)