From d0ac58ad3135776ec6e108f82a5c64a9662cc122 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Mar 2026 21:25:03 -0800
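Subject: [PATCH] API: Fix tool_calls placement and other response
 compatibility issues

* Chat completions: nest `tool_calls` inside the choice's `message` object
  instead of placing it next to `message`, and update the tool-calling
  example in the docs to read it from there.
* Completions: always report the object type as `text_completion`,
  including when streaming, instead of `text_completion.chunk`.
* Model/LoRA load endpoints: raise `HTTPException` on failure instead of
  returning the exception object, so the 400 status is actually sent.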

---
 docs/12 - OpenAI API.md          | 4 ++--
 extensions/openai/completions.py | 5 ++---
 extensions/openai/script.py      | 4 ++--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index ebe24797..637ccced 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -456,11 +456,11 @@ for _ in range(10):
         messages.append({
             "role": "assistant",
             "content": choice["message"]["content"],
-            "tool_calls": choice["tool_calls"],
+            "tool_calls": choice["message"]["tool_calls"],
         })
 
         # Execute each tool and add results to history
-        for tool_call in choice["tool_calls"]:
+        for tool_call in choice["message"]["tool_calls"]:
             name = tool_call["function"]["name"]
             arguments = json.loads(tool_call["function"]["arguments"])
             result = execute_tool(name, arguments)
diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 5187343f..1538b87a 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -370,8 +370,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             resp_list: [{
                 "index": 0,
                 "finish_reason": stop_reason,
-                "message": {"role": "assistant", "content": answer},
-                "tool_calls": tool_calls
+                "message": {"role": "assistant", "content": answer, "tool_calls": tool_calls},
             }],
             "usage": {
                 "prompt_tokens": token_count,
@@ -389,7 +388,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
 
 
 def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_event=None):
-    object_type = 'text_completion.chunk' if stream else 'text_completion'
+    object_type = 'text_completion'
     created_time = int(time.time())
     cmpl_id = "conv-%d" % (int(time.time() * 1000000000))
     resp_list = 'data' if is_legacy else 'choices'
diff --git a/extensions/openai/script.py b/extensions/openai/script.py
index 521d2cb4..bfb6fd54 100644
--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@@ -359,7 +359,7 @@ async def handle_load_model(request_data: LoadModelRequest):
         return JSONResponse(content="OK")
     except Exception:
         traceback.print_exc()
-        return HTTPException(status_code=400, detail="Failed to load the model.")
+        raise HTTPException(status_code=400, detail="Failed to load the model.")
 
 
 @app.post("/v1/internal/model/unload", dependencies=check_admin_key)
@@ -380,7 +380,7 @@ async def handle_load_loras(request_data: LoadLorasRequest):
         return JSONResponse(content="OK")
     except Exception:
         traceback.print_exc()
-        return HTTPException(status_code=400, detail="Failed to apply the LoRA(s).")
+        raise HTTPException(status_code=400, detail="Failed to apply the LoRA(s).")
 
 
 @app.post("/v1/internal/lora/unload", dependencies=check_admin_key)