From bf6fbc019dbd9470efdeafa033818efa178d7735 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 20 Mar 2026 14:46:00 -0300
Subject: [PATCH] API: Move OpenAI-compatible API from extensions/openai to
 modules/api

---
 .../workflows/build-portable-release-cuda.yml |  2 +-
 .../workflows/build-portable-release-rocm.yml |  2 +-
 .../build-portable-release-vulkan.yml         |  2 +-
 .github/workflows/build-portable-release.yml  |  2 +-
 docs/07 - Extensions.md                       |  1 -
 docs/12 - OpenAI API.md                       | 12 +------
 modules/api/__init__.py                       |  0
 .../api}/cache_embedding_model.py             |  0
 .../openai => modules/api}/completions.py     |  6 ++--
 .../openai => modules/api}/embeddings.py      | 10 +++---
 {extensions/openai => modules/api}/errors.py  |  0
 {extensions/openai => modules/api}/images.py  |  2 +-
 {extensions/openai => modules/api}/logits.py  |  2 +-
 {extensions/openai => modules/api}/models.py  |  0
 .../openai => modules/api}/moderations.py     |  2 +-
 {extensions/openai => modules/api}/script.py  | 34 ++++++++++---------
 {extensions/openai => modules/api}/tokens.py  |  0
 {extensions/openai => modules/api}/typing.py  |  0
 {extensions/openai => modules/api}/utils.py   |  3 +-
 modules/extensions.py                         |  3 +-
 modules/shared.py                             | 16 +--------
 modules/ui_session.py                         |  2 --
 server.py                                     | 15 ++++++++
 23 files changed, 51 insertions(+), 65 deletions(-)
 create mode 100644 modules/api/__init__.py
 rename {extensions/openai => modules/api}/cache_embedding_model.py (100%)
 rename {extensions/openai => modules/api}/completions.py (99%)
 rename {extensions/openai => modules/api}/embeddings.py (90%)
 rename {extensions/openai => modules/api}/errors.py (100%)
 rename {extensions/openai => modules/api}/images.py (96%)
 rename {extensions/openai => modules/api}/logits.py (84%)
 rename {extensions/openai => modules/api}/models.py (100%)
 rename {extensions/openai => modules/api}/moderations.py (97%)
 rename {extensions/openai => modules/api}/script.py (96%)
 rename {extensions/openai => modules/api}/tokens.py (100%)
 rename {extensions/openai => modules/api}/typing.py (100%)
 rename {extensions/openai => modules/api}/utils.py (93%)

diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml
index 5d66bd77..f9eea58a 100644
--- a/.github/workflows/build-portable-release-cuda.yml
+++ b/.github/workflows/build-portable-release-cuda.yml
@@ -106,7 +106,7 @@ jobs:
             cd "text-generation-webui-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
-            allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+            allowed=("character_bias" "gallery" "sd_api_pictures")
             find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
 
             # Define common variables
diff --git a/.github/workflows/build-portable-release-rocm.yml b/.github/workflows/build-portable-release-rocm.yml
index b9a10bac..db42b7dc 100644
--- a/.github/workflows/build-portable-release-rocm.yml
+++ b/.github/workflows/build-portable-release-rocm.yml
@@ -105,7 +105,7 @@ jobs:
             cd "text-generation-webui-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
-            allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+            allowed=("character_bias" "gallery" "sd_api_pictures")
             find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
 
             # Define common variables
diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml
index 9748d5b8..8f5aa7c8 100644
--- a/.github/workflows/build-portable-release-vulkan.yml
+++ b/.github/workflows/build-portable-release-vulkan.yml
@@ -105,7 +105,7 @@ jobs:
             cd "text-generation-webui-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
-            allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+            allowed=("character_bias" "gallery" "sd_api_pictures")
             find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
 
             # Define common variables
diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml
index e03116f6..9ace90f6 100644
--- a/.github/workflows/build-portable-release.yml
+++ b/.github/workflows/build-portable-release.yml
@@ -105,7 +105,7 @@ jobs:
             cd "text-generation-webui-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
-            allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+            allowed=("character_bias" "gallery" "sd_api_pictures")
             find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
 
             # Define common variables
diff --git a/docs/07 - Extensions.md b/docs/07 - Extensions.md
index 48cd30ce..779b2a34 100644
--- a/docs/07 - Extensions.md	
+++ b/docs/07 - Extensions.md	
@@ -20,7 +20,6 @@ If you create an extension, you are welcome to host it in a GitHub repository an
 
 |Extension|Description|
 |---------|-----------|
-|[openai](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai)| Creates an API that mimics the OpenAI API and can be used as a drop-in replacement. |
 |[superboogav2](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/superboogav2)| Enhanced RAG extension with support for PDF, DOCX, and PPTX files. |
 |[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. |
 |[coqui_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/coqui_tts)| Text-to-speech extension using Coqui XTTS v2. |
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 637ccced..276a7e19 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -19,7 +19,7 @@ Add `--api` to your command-line flags.
 
 ### Examples
 
-For the documentation with all the endpoints, parameters and their types, consult `http://127.0.0.1:5000/docs` or the [typing.py](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/openai/typing.py) file.
+For the documentation with all the endpoints, parameters and their types, consult `http://127.0.0.1:5000/docs` or the [typing.py](https://github.com/oobabooga/text-generation-webui/blob/main/modules/api/typing.py) file.
 
 The official examples in the [OpenAI documentation](https://platform.openai.com/docs/api-reference) should also work, and the same parameters apply (although the API here has more optional parameters).
 
@@ -490,16 +490,6 @@ The following environment variables can be used (they take precedence over every
 | `OPENEDAI_EMBEDDING_MODEL` | Embedding model (if applicable) |          sentence-transformers/all-mpnet-base-v2                  |
 | `OPENEDAI_EMBEDDING_DEVICE` | Embedding device (if applicable) |           cuda                 |
 
-#### Persistent settings with `settings.yaml`
-
-You can also set the following variables in your `settings.yaml` file:
-
-```
-openai-embedding_device: cuda
-openai-embedding_model: "sentence-transformers/all-mpnet-base-v2"
-openai-debug: 1
-```
-
 ### Third-party application setup
 
 You can usually force an application that uses the OpenAI API to connect to the local API by using the following environment variables:
diff --git a/modules/api/__init__.py b/modules/api/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/extensions/openai/cache_embedding_model.py b/modules/api/cache_embedding_model.py
similarity index 100%
rename from extensions/openai/cache_embedding_model.py
rename to modules/api/cache_embedding_model.py
diff --git a/extensions/openai/completions.py b/modules/api/completions.py
similarity index 99%
rename from extensions/openai/completions.py
rename to modules/api/completions.py
index d0cd9802..8948bb86 100644
--- a/extensions/openai/completions.py
+++ b/modules/api/completions.py
@@ -9,9 +9,9 @@ import tiktoken
 import yaml
 from pydantic import ValidationError
 
-from extensions.openai.errors import InvalidRequestError
-from extensions.openai.typing import ToolDefinition
-from extensions.openai.utils import debug_msg
+from .errors import InvalidRequestError
+from .typing import ToolDefinition
+from .utils import debug_msg
 from modules.tool_parsing import get_tool_call_id, parse_tool_call, detect_tool_call_format
 from modules import shared
 from modules.reasoning import extract_reasoning
diff --git a/extensions/openai/embeddings.py b/modules/api/embeddings.py
similarity index 90%
rename from extensions/openai/embeddings.py
rename to modules/api/embeddings.py
index 1420879c..ad299c9d 100644
--- a/extensions/openai/embeddings.py
+++ b/modules/api/embeddings.py
@@ -3,8 +3,8 @@ import os
 import numpy as np
 from transformers import AutoModel
 
-from extensions.openai.errors import ServiceUnavailableError
-from extensions.openai.utils import debug_msg, float_list_to_base64
+from .errors import ServiceUnavailableError
+from .utils import debug_msg, float_list_to_base64
 from modules.logging_colors import logger
 
 embeddings_params_initialized = False
@@ -17,14 +17,12 @@ def initialize_embedding_params():
     '''
     global embeddings_params_initialized
     if not embeddings_params_initialized:
-        from extensions.openai.script import params
-
         global st_model, embeddings_model, embeddings_device
 
-        st_model = os.environ.get("OPENEDAI_EMBEDDING_MODEL", params.get('embedding_model', 'all-mpnet-base-v2'))
+        st_model = os.environ.get("OPENEDAI_EMBEDDING_MODEL", 'sentence-transformers/all-mpnet-base-v2')
         embeddings_model = None
         # OPENEDAI_EMBEDDING_DEVICE: auto (best or cpu), cpu, cuda, ipu, xpu, mkldnn, opengl, opencl, ideep, hip, ve, fpga, ort, xla, lazy, vulkan, mps, meta, hpu, mtia, privateuseone
-        embeddings_device = os.environ.get("OPENEDAI_EMBEDDING_DEVICE", params.get('embedding_device', 'cpu'))
+        embeddings_device = os.environ.get("OPENEDAI_EMBEDDING_DEVICE", 'cpu')
         if embeddings_device.lower() == 'auto':
             embeddings_device = None
 
diff --git a/extensions/openai/errors.py b/modules/api/errors.py
similarity index 100%
rename from extensions/openai/errors.py
rename to modules/api/errors.py
diff --git a/extensions/openai/images.py b/modules/api/images.py
similarity index 96%
rename from extensions/openai/images.py
rename to modules/api/images.py
index f7be3d22..95704535 100644
--- a/extensions/openai/images.py
+++ b/modules/api/images.py
@@ -6,7 +6,7 @@ import base64
 import io
 import time
 
-from extensions.openai.errors import ServiceUnavailableError
+from .errors import ServiceUnavailableError
 from modules import shared
 
 
diff --git a/extensions/openai/logits.py b/modules/api/logits.py
similarity index 84%
rename from extensions/openai/logits.py
rename to modules/api/logits.py
index 280612db..e0c7ea0e 100644
--- a/extensions/openai/logits.py
+++ b/modules/api/logits.py
@@ -1,4 +1,4 @@
-from extensions.openai.completions import process_parameters
+from .completions import process_parameters
 from modules.logits import get_next_logits
 
 
diff --git a/extensions/openai/models.py b/modules/api/models.py
similarity index 100%
rename from extensions/openai/models.py
rename to modules/api/models.py
diff --git a/extensions/openai/moderations.py b/modules/api/moderations.py
similarity index 97%
rename from extensions/openai/moderations.py
rename to modules/api/moderations.py
index 1ca6b8ab..ac0539d6 100644
--- a/extensions/openai/moderations.py
+++ b/modules/api/moderations.py
@@ -3,7 +3,7 @@ import time
 import numpy as np
 from numpy.linalg import norm
 
-from extensions.openai.embeddings import get_embeddings
+from .embeddings import get_embeddings
 
 moderations_disabled = False  # return 0/false
 category_embeddings = None
diff --git a/extensions/openai/script.py b/modules/api/script.py
similarity index 96%
rename from extensions/openai/script.py
rename to modules/api/script.py
index a0d5deb8..356919e9 100644
--- a/extensions/openai/script.py
+++ b/modules/api/script.py
@@ -13,16 +13,15 @@ from fastapi import Depends, FastAPI, Header, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.requests import Request
 from fastapi.responses import JSONResponse
-from pydub import AudioSegment
 from sse_starlette import EventSourceResponse
 from starlette.concurrency import iterate_in_threadpool
 
-import extensions.openai.completions as OAIcompletions
-import extensions.openai.logits as OAIlogits
-import extensions.openai.models as OAImodels
-from extensions.openai.tokens import token_count, token_decode, token_encode
-from extensions.openai.errors import OpenAIError
-from extensions.openai.utils import _start_cloudflared
+import modules.api.completions as OAIcompletions
+import modules.api.logits as OAIlogits
+import modules.api.models as OAImodels
+from .tokens import token_count, token_decode, token_encode
+from .errors import OpenAIError
+from .utils import _start_cloudflared
 from modules import shared
 from modules.logging_colors import logger
 from modules.models import unload_model
@@ -53,12 +52,6 @@ from .typing import (
     to_dict
 )
 
-params = {
-    'embedding_device': 'cpu',
-    'embedding_model': 'sentence-transformers/all-mpnet-base-v2',
-    'debug': 0
-}
-
 
 async def _wait_for_disconnect(request: Request, stop_event: threading.Event):
     """Block until the client disconnects, then signal the stop_event."""
@@ -244,6 +237,7 @@ def handle_billing_usage():
 @app.post('/v1/audio/transcriptions', dependencies=check_key)
 async def handle_audio_transcription(request: Request):
     import speech_recognition as sr
+    from pydub import AudioSegment
 
     r = sr.Recognizer()
 
@@ -275,7 +269,7 @@ async def handle_audio_transcription(request: Request):
 
 @app.post('/v1/images/generations', response_model=ImageGenerationResponse, dependencies=check_key)
 async def handle_image_generation(request_data: ImageGenerationRequest):
-    import extensions.openai.images as OAIimages
+    import modules.api.images as OAIimages
 
     response = await asyncio.to_thread(OAIimages.generations, request_data)
     return JSONResponse(response)
@@ -283,7 +277,7 @@ async def handle_image_generation(request_data: ImageGenerationRequest):
 
 @app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key)
 async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
-    import extensions.openai.embeddings as OAIembeddings
+    import modules.api.embeddings as OAIembeddings
 
     input = request_data.input
     if not input:
@@ -298,7 +292,7 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
 
 @app.post("/v1/moderations", dependencies=check_key)
 async def handle_moderations(request: Request):
-    import extensions.openai.moderations as OAImoderations
+    import modules.api.moderations as OAImoderations
 
     body = await request.json()
     input = body["input"]
@@ -500,7 +494,15 @@ def run_server():
     uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
 
 
+_server_started = False
+
+
 def setup():
+    global _server_started
+    if _server_started:
+        return
+
+    _server_started = True
     if shared.args.nowebui:
         run_server()
     else:
diff --git a/extensions/openai/tokens.py b/modules/api/tokens.py
similarity index 100%
rename from extensions/openai/tokens.py
rename to modules/api/tokens.py
diff --git a/extensions/openai/typing.py b/modules/api/typing.py
similarity index 100%
rename from extensions/openai/typing.py
rename to modules/api/typing.py
diff --git a/extensions/openai/utils.py b/modules/api/utils.py
similarity index 93%
rename from extensions/openai/utils.py
rename to modules/api/utils.py
index 2b414769..fae181ff 100644
--- a/extensions/openai/utils.py
+++ b/modules/api/utils.py
@@ -23,8 +23,7 @@ def float_list_to_base64(float_array: np.ndarray) -> str:
 
 
 def debug_msg(*args, **kwargs):
-    from extensions.openai.script import params
-    if os.environ.get("OPENEDAI_DEBUG", params.get('debug', 0)):
+    if os.environ.get("OPENEDAI_DEBUG", 0):
         print(*args, **kwargs)
 
 
diff --git a/modules/extensions.py b/modules/extensions.py
index 4bb7b683..09db9f40 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -32,8 +32,7 @@ def load_extensions():
         if name not in available_extensions:
             continue
 
-        if name != 'api':
-            logger.info(f'Loading the extension "{name}"')
+        logger.info(f'Loading the extension "{name}"')
 
         try:
             # Prefer user extension, fall back to system extension
diff --git a/modules/shared.py b/modules/shared.py
index 37bc5876..69e16960 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -156,7 +156,7 @@ group.add_argument('--portable', action='store_true', help='Hide features not av
 
 # API
 group = parser.add_argument_group('API')
-group.add_argument('--api', action='store_true', help='Enable the API extension.')
+group.add_argument('--api', action='store_true', help='Enable the API server.')
 group.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudflare.')
 group.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None)
 group.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.')
@@ -435,16 +435,6 @@ def fix_loader_name(name):
         return 'TensorRT-LLM'
 
 
-def add_extension(name, last=False):
-    if args.extensions is None:
-        args.extensions = [name]
-    elif last:
-        args.extensions = [x for x in args.extensions if x != name]
-        args.extensions.append(name)
-    elif name not in args.extensions:
-        args.extensions.append(name)
-
-
 def is_chat():
     return True
 
@@ -464,10 +454,6 @@ def load_user_config():
 
 args.loader = fix_loader_name(args.loader)
 
-# Activate the API extension
-if args.api or args.public_api:
-    add_extension('openai', last=True)
-
 # Load model-specific settings
 p = Path(f'{args.model_dir}/config.yaml')
 if p.exists():
diff --git a/modules/ui_session.py b/modules/ui_session.py
index 19026fbb..3f2c8a7b 100644
--- a/modules/ui_session.py
+++ b/modules/ui_session.py
@@ -95,8 +95,6 @@ def set_interface_arguments(extensions, bool_active):
         setattr(shared.args, k, False)
     for k in bool_active:
         setattr(shared.args, k, True)
-        if k == 'api':
-            shared.add_extension('openai', last=True)
 
     shared.need_restart = True
 
diff --git a/server.py b/server.py
index 1aa9fc04..cbdd2854 100644
--- a/server.py
+++ b/server.py
@@ -106,6 +106,11 @@ def create_interface():
     if shared.args.extensions is not None and len(shared.args.extensions) > 0:
         extensions_module.load_extensions()
 
+    # Start the API server if enabled
+    if shared.args.api or shared.args.public_api:
+        from modules.api.script import setup as api_setup
+        api_setup()
+
     # Force some events to be triggered on page load
     shared.persistent_interface_state.update({
         'mode': shared.settings['mode'],
@@ -273,6 +278,12 @@ if __name__ == "__main__":
     # Activate the extensions listed on settings.yaml
     extensions_module.available_extensions = utils.get_available_extensions()
     for extension in shared.settings['default_extensions']:
+        # The openai extension moved to modules/api and is now enabled by --api.
+        # For backward compatibility, treat a saved 'openai' entry as --api.
+        if extension == 'openai':
+            shared.args.api = True
+            continue
+
         shared.args.extensions = shared.args.extensions or []
         if extension not in shared.args.extensions:
             shared.args.extensions.append(extension)
@@ -337,6 +348,10 @@ if __name__ == "__main__":
         shared.args.extensions = [x for x in (shared.args.extensions or []) if x != 'gallery']
         if shared.args.extensions:
             extensions_module.load_extensions()
+
+        if shared.args.api or shared.args.public_api:
+            from modules.api.script import setup as api_setup
+            api_setup()
     else:
         # Launch the web UI
         create_interface()