mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-18 03:14:39 +01:00
API: Stream reasoning_content separately from content in OpenAI-compatible responses
This commit is contained in:
parent
accb2ef661
commit
09a6549816
|
|
@@ -417,7 +417,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
|
|||
logprob_proc.token_alternatives_history.clear()
|
||||
chat_logprobs_offset = [0] # mutable for closure access in streaming
|
||||
|
||||
def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False):
|
||||
def chat_streaming_chunk(content=None, chunk_tool_calls=None, include_role=False, reasoning_content=None):
|
||||
# begin streaming
|
||||
delta = {}
|
||||
if include_role:
|
||||
|
|
@@ -425,6 +425,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
|
|||
delta['refusal'] = None
|
||||
if content is not None:
|
||||
delta['content'] = content
|
||||
if reasoning_content is not None:
|
||||
delta['reasoning_content'] = reasoning_content
|
||||
if chunk_tool_calls:
|
||||
delta['tool_calls'] = chunk_tool_calls
|
||||
|
||||
|
|
@@ -477,6 +479,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
|
|||
|
||||
answer = ''
|
||||
seen_content = ''
|
||||
seen_reasoning = ''
|
||||
|
||||
tool_calls = []
|
||||
end_last_tool_call = 0
|
||||
|
|
@@ -508,17 +511,31 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
|
|||
break
|
||||
|
||||
if stream:
|
||||
len_seen = len(seen_content)
|
||||
new_content = answer[len_seen:]
|
||||
# Strip reasoning/thinking blocks so only final content is streamed.
|
||||
# Reasoning is emitted separately as reasoning_content deltas.
|
||||
reasoning, content = extract_reasoning(answer)
|
||||
if reasoning is not None:
|
||||
new_reasoning = reasoning[len(seen_reasoning):]
|
||||
new_content = content[len(seen_content):]
|
||||
else:
|
||||
new_reasoning = None
|
||||
new_content = answer[len(seen_content):]
|
||||
|
||||
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
|
||||
if (not new_content and not new_reasoning) or chr(0xfffd) in (new_content or '') + (new_reasoning or ''):
|
||||
continue
|
||||
|
||||
chunk = chat_streaming_chunk(new_content)
|
||||
chunk = chat_streaming_chunk(
|
||||
content=new_content if new_content else None,
|
||||
reasoning_content=new_reasoning if new_reasoning else None,
|
||||
)
|
||||
if include_usage:
|
||||
chunk['usage'] = None
|
||||
|
||||
seen_content = answer
|
||||
if reasoning is not None:
|
||||
seen_reasoning = reasoning
|
||||
seen_content = content
|
||||
else:
|
||||
seen_content = answer
|
||||
yield chunk
|
||||
|
||||
token_count = shared.model.last_prompt_token_count if hasattr(shared.model, 'last_prompt_token_count') else 0
|
||||
|
|
|
|||
|
|
@@ -8,7 +8,7 @@ THINKING_FORMATS = [
|
|||
('<|channel|>commentary<|message|>', '<|end|>', '<|start|>assistant<|channel|>final<|message|>'),
|
||||
('<seed:think>', '</seed:think>', None),
|
||||
('<|think|>', '<|end|>', '<|content|>'), # Solar Open
|
||||
('Thinking Process:', '</think>', None), # Qwen3.5 verbose thinking outside tags
|
||||
# ('Thinking Process:', '</think>', None), # Qwen3.5 verbose thinking outside tags -- removed: too prone to false positives in streaming
|
||||
(None, '</think>', None), # End-only variant (e.g., Qwen3-next)
|
||||
]
|
||||
|
||||
|
|
@@ -42,6 +42,12 @@ def extract_reasoning(text, html_escaped=False):
|
|||
start_esc = esc(start_tag)
|
||||
start_pos = text.find(start_esc)
|
||||
if start_pos == -1:
|
||||
# During streaming, the start tag may be arriving partially.
|
||||
# If the text is a prefix of a start tag, return empty content
|
||||
# to prevent the partial tag from leaking.
|
||||
stripped = text.strip()
|
||||
if stripped and start_esc.startswith(stripped):
|
||||
return '', ''
|
||||
continue
|
||||
thought_start = start_pos + len(start_esc)
|
||||
end_pos = text.find(end_esc, thought_start)
|
||||
|
|
@@ -63,7 +69,13 @@ def extract_reasoning(text, html_escaped=False):
|
|||
thought_end = end_pos
|
||||
if content_esc:
|
||||
content_pos = text.find(content_esc, end_pos)
|
||||
content_start = content_pos + len(content_esc) if content_pos != -1 else end_pos + len(end_esc)
|
||||
if content_pos != -1:
|
||||
content_start = content_pos + len(content_esc)
|
||||
else:
|
||||
# Content tag expected but not yet present (e.g. partial
|
||||
# streaming) — suppress intermediate tags between end_tag
|
||||
# and content_tag so they don't leak as content.
|
||||
content_start = len(text)
|
||||
else:
|
||||
content_start = end_pos + len(end_esc)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue