From 807be1183272fac409ce8f08609dbdd0d9f63362 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 18:48:50 -0700
Subject: [PATCH 01/76] Remove obsolete models/config.yaml and related code

---
 docs/01 - Chat Tab.md        |   2 +-
 docs/12 - OpenAI API.md      |   2 +-
 modules/models.py            |   1 -
 modules/models_settings.py   |   9 +-
 modules/shared.py            |  10 --
 server.py                    |   5 -
 user_data/models/config.yaml | 203 -----------------------------------
 7 files changed, 4 insertions(+), 228 deletions(-)
 delete mode 100644 user_data/models/config.yaml

diff --git a/docs/01 - Chat Tab.md b/docs/01 - Chat Tab.md
index 5104895f..96b232fa 100644
--- a/docs/01 - Chat Tab.md	
+++ b/docs/01 - Chat Tab.md	
@@ -112,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin
 
 The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template.
 
-Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format.
+Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any) from the model metadata (e.g. `tokenizer_config.json` or GGUF metadata), and will update the values under "Parameters" > "Instruction template" accordingly. You should check the model card on Hugging Face to see if you are using the correct prompt format.
 
 ### Chat-instruct
 
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 2a7a7f69..0a076c35 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \
 
 #### Chat completions
 
-Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`.
+Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be detected automatically from the model metadata.
 
 ```shell
 curl http://127.0.0.1:5000/v1/chat/completions \
diff --git a/modules/models.py b/modules/models.py
index 1d139b89..b2665c6b 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -67,7 +67,6 @@ def load_model(model_name, loader=None):
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
     logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
-    logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
     return model, tokenizer
 
 
diff --git a/modules/models_settings.py b/modules/models_settings.py
index dcface71..eafa0581 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -23,14 +23,9 @@ def get_fallback_settings():
 
 def get_model_metadata(model):
     model_path = resolve_model_path(model)
-    model_settings = {}
 
-    # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
-    settings = shared.model_config
-    for pat in settings:
-        if re.match(pat.lower(), Path(model).name.lower()):
-            for k in settings[pat]:
-                model_settings[k] = settings[pat][k]
+    # Fallback settings
+    model_settings = get_fallback_settings()
 
     path = model_path / 'config.json'
     if path.exists():
diff --git a/modules/shared.py b/modules/shared.py
index 16ccbe77..acb103b4 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -454,17 +454,7 @@ def load_user_config():
 
 args.loader = fix_loader_name(args.loader)
 
-# Load model-specific settings
-p = Path(f'{args.model_dir}/config.yaml')
-if p.exists():
-    model_config = yaml.safe_load(open(p, 'r').read())
-else:
-    model_config = {}
-del p
-
-
 # Load custom model-specific settings
 user_config = load_user_config()
 
-model_config = OrderedDict(model_config)
 user_config = OrderedDict(user_config)
diff --git a/server.py b/server.py
index d224909c..88936ca6 100644
--- a/server.py
+++ b/server.py
@@ -18,7 +18,6 @@ import modules.extensions as extensions_module
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
-    get_fallback_settings,
     get_model_metadata,
     update_model_parameters
 )
@@ -271,10 +270,6 @@ if __name__ == "__main__":
     # Apply CLI overrides for image model settings (CLI flags take precedence over saved settings)
     shared.apply_image_model_cli_overrides()
 
-    # Fallback settings for models
-    shared.model_config['.*'] = get_fallback_settings()
-    shared.model_config.move_to_end('.*', last=False)  # Move to the beginning
-
     # Activate the extensions listed on settings.yaml
     extensions_module.available_extensions = utils.get_available_extensions()
     for extension in shared.settings['default_extensions']:
diff --git a/user_data/models/config.yaml b/user_data/models/config.yaml
deleted file mode 100644
index 038ebcf1..00000000
--- a/user_data/models/config.yaml
+++ /dev/null
@@ -1,203 +0,0 @@
-.*(llama|alpac|vicuna|guanaco|koala|llava|wizardlm|metharme|pygmalion-7b|pygmalion-2|mythalion|wizard-mega|openbuddy|vigogne|h2ogpt-research|manticore):
-  model_type: 'llama'
-.*(opt-|opt_|opt1|opt3|optfor|galactica|galpaca|pygmalion-350m):
-  model_type: 'opt'
-.*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1):
-  model_type: 'gptj'
-.*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm):
-  model_type: 'gptneox'
-.*bloom:
-  model_type: 'bloom'
-.*gpt2:
-  model_type: 'gpt2'
-.*falcon:
-  model_type: 'falcon'
-.*mpt:
-  model_type: 'mpt'
-.*(starcoder|starchat):
-  model_type: 'starcoder'
-.*dolly-v2:
-  model_type: 'dollyv2'
-.*replit:
-  model_type: 'replit'
-.*(oasst|openassistant-|stablelm-7b-sft-v7-epoch-3):
-  instruction_template: 'Open Assistant'
-  skip_special_tokens: false
-(?!.*galactica)(?!.*reward).*openassistant:
-  instruction_template: 'Open Assistant'
-  skip_special_tokens: false
-.*galactica:
-  skip_special_tokens: false
-.*dolly-v[0-9]-[0-9]*b:
-  instruction_template: 'Alpaca'
-  skip_special_tokens: false
-.*alpaca-native-4bit:
-  instruction_template: 'Alpaca'
-.*llava:
-  instruction_template: 'LLaVA'
-.*llava.*1.5:
-  instruction_template: 'Vicuna-v1.1'
-.*wizard.*mega:
-  instruction_template: 'Wizard-Mega'
-.*starchat-beta:
-  instruction_template: 'Starchat-Beta'
-(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna:
-  instruction_template: 'Vicuna-v0'
-.*vicuna.*v0:
-  instruction_template: 'Vicuna-v0'
-.*vicuna.*(1.1|1_1|1.3|1_3):
-  instruction_template: 'Vicuna-v1.1'
-.*vicuna.*(1.5|1_5):
-  instruction_template: 'Vicuna-v1.1'
-.*stable.*vicuna:
-  instruction_template: 'StableVicuna'
-(?!.*chat).*chinese-vicuna:
-  instruction_template: 'Alpaca'
-.*chinese-vicuna.*chat:
-  instruction_template: 'Chinese-Vicuna-Chat'
-.*alpaca:
-  instruction_template: 'Alpaca'
-.*koala:
-  instruction_template: 'Koala'
-.*chatglm:
-  instruction_template: 'ChatGLM'
-.*(metharme|pygmalion|mythalion):
-  instruction_template: 'Metharme'
-.*raven:
-  instruction_template: 'RWKV-Raven'
-.*moss-moon.*sft:
-  instruction_template: 'MOSS'
-.*stablelm-tuned:
-  instruction_template: 'StableLM'
-.*galactica.*finetuned:
-  instruction_template: 'Galactica Finetuned'
-.*galactica.*-v2:
-  instruction_template: 'Galactica v2'
-(?!.*finetuned)(?!.*-v2).*galactica:
-  instruction_template: 'Galactica'
-.*guanaco:
-  instruction_template: 'Guanaco non-chat'
-.*baize:
-  instruction_template: 'Baize'
-.*mpt-.*instruct:
-  instruction_template: 'Alpaca'
-.*mpt-.*chat:
-  instruction_template: 'ChatML'
-(?!.*-flan-)(?!.*-t5-).*lamini-:
-  instruction_template: 'Alpaca'
-.*incite.*chat:
-  instruction_template: 'INCITE-Chat'
-.*incite.*instruct:
-  instruction_template: 'INCITE-Instruct'
-.*ziya-:
-  instruction_template: 'Ziya'
-.*koalpaca:
-  instruction_template: 'KoAlpaca'
-.*openbuddy:
-  instruction_template: 'OpenBuddy'
-(?!.*chat).*vigogne:
-  instruction_template: 'Vigogne-Instruct'
-.*vigogne.*chat:
-  instruction_template: 'Vigogne-Chat'
-.*(llama-deus|supercot|llama-natural-instructions|open-llama-0.3t-7b-instruct-dolly-hhrlhf|open-llama-0.3t-7b-open-instruct):
-  instruction_template: 'Alpaca'
-.*bactrian:
-  instruction_template: 'Bactrian'
-.*(h2ogpt-oig-|h2ogpt-oasst1-|h2ogpt-research-oasst1-):
-  instruction_template: 'INCITE-Chat'
-.*h2ogpt-gm-:
-  instruction_template: 'H2O-prompt_answer'
-.*manticore:
-  instruction_template: 'Manticore Chat'
-.*bluemoonrp-(30|13)b:
-  instruction_template: 'Bluemoon'
-.*Nous-Hermes-13b:
-  instruction_template: 'Alpaca'
-.*airoboros:
-  instruction_template: 'Vicuna-v1.1'
-.*airoboros.*1.2:
-  instruction_template: 'Airoboros-v1.2'
-.*alpa(cino|sta):
-  instruction_template: 'Alpaca'
-.*hippogriff:
-  instruction_template: 'Hippogriff'
-.*lazarus:
-  instruction_template: 'Alpaca'
-.*guanaco-.*(7|13|33|65)b:
-  instruction_template: 'Vicuna-v0'
-.*hypermantis:
-  instruction_template: 'Alpaca'
-.*open-llama-.*-open-instruct:
-  instruction_template: 'Alpaca'
-.*starcoder-gpteacher-code-instruct:
-  instruction_template: 'Alpaca'
-.*tulu:
-  instruction_template: 'Tulu'
-.*chronos:
-  instruction_template: 'Alpaca'
-.*samantha:
-  instruction_template: 'Samantha'
-.*wizardcoder:
-  instruction_template: 'Alpaca'
-.*minotaur:
-  instruction_template: 'Manticore Chat'
-.*orca_mini:
-  instruction_template: 'Orca Mini'
-.*(platypus|gplatty|superplatty):
-  instruction_template: 'Alpaca'
-.*(openorca-platypus2):
-  instruction_template: 'OpenOrca-Platypus2'
-.*longchat:
-  instruction_template: 'Vicuna-v1.1'
-.*vicuna-33b:
-  instruction_template: 'Vicuna-v1.1'
-.*redmond-hermes-coder:
-  instruction_template: 'Alpaca'
-.*wizardcoder-15b:
-  instruction_template: 'Alpaca'
-.*wizardlm:
-  instruction_template: 'Vicuna-v1.1'
-.*godzilla:
-  instruction_template: 'Alpaca'
-.*llama(-?)(2|v2).*chat:
-  instruction_template: 'Llama-v2'
-.*newhope:
-  instruction_template: 'NewHope'
-.*stablebeluga2:
-  instruction_template: 'StableBeluga2'
-.*openchat:
-  instruction_template: 'OpenChat'
-.*codellama.*instruct:
-  instruction_template: 'Llama-v2'
-.*(mistral|mixtral).*instruct:
-  instruction_template: 'Mistral'
-.*mistral.*openorca:
-  instruction_template: 'ChatML'
-.*(WizardCoder-Python-34B-V1.0|Phind-CodeLlama-34B-v2|CodeBooga-34B-v0.1):
-  instruction_template: 'Alpaca'
-.*orca-2-(13|7)b:
-  instruction_template: 'ChatML'
-.*openhermes.*mistral:
-  instruction_template: 'ChatML'
-.*Yi-34B-Chat:
-  instruction_template: 'ChatML'
-(dolphin).*:
-  instruction_template: 'ChatML'
-.*synthia:
-  instruction_template: 'Synthia'
-.*(hercules|hyperion):
-  instruction_template: 'ChatML'
-.*command-r:
-  instruction_template: 'Command-R'
-.*xwin-lm-70b-v0.1:
-  instruction_template: 'Vicuna-v1.1'
-.*platypus-yi-34b:
-  instruction_template: 'Vicuna-v1.1'
-.*CausalLM-RP-34B:
-  instruction_template: 'ChatML'
-34b-beta:
-  instruction_template: 'ChatML'
-.*airoboros-3_1-yi-34b-200k:
-  instruction_template: 'Llama-v2'
-.*chatqa:
-  instruction_template: 'NVIDIA-ChatQA'

From d6f1485dd189494f6fbe5b6ea7ebd5cc0404233a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 21:45:11 -0700
Subject: [PATCH 02/76] UI: Update the enable_thinking info message

---
 modules/ui_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index f1dc7883..10d05f65 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -82,7 +82,7 @@ def create_ui():
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
-                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')
+                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.')
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 

From 368f37335f634ba001d00d2841902de85c7b48db Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 25 Mar 2026 06:37:45 -0700
Subject: [PATCH 03/76] Fix --idle-timeout issues with encode/decode and
 parallel generation

---
 modules/logits.py          |  4 +---
 modules/models.py          | 15 ++++++++++++++-
 modules/text_generation.py | 18 +++++++++++++-----
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/modules/logits.py b/modules/logits.py
index 1f878f27..473f5890 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -4,7 +4,6 @@ import numpy as np
 
 from modules import models, shared
 from modules.logging_colors import logger
-from modules.models import load_model
 from modules.text_generation import generate_reply
 from modules.utils import check_model_loaded
 
@@ -12,8 +11,7 @@ global_scores = None
 
 
 def get_next_logits(*args, **kwargs):
-    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
-        shared.model, shared.tokenizer = load_model(shared.model_name)
+    models.load_model_if_idle_unloaded()
 
     needs_lock = not args[2]  # use_samplers
     if needs_lock:
diff --git a/modules/models.py b/modules/models.py
index b2665c6b..61ca3838 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -1,4 +1,5 @@
 import sys
+import threading
 import time
 
 import modules.shared as shared
@@ -7,6 +8,15 @@ from modules.models_settings import get_model_metadata
 from modules.utils import resolve_model_path
 
 last_generation_time = time.time()
+active_generation_count = 0
+_generation_count_lock = threading.Lock()
+
+
+def load_model_if_idle_unloaded():  # reload shared.model_name when --idle-timeout is enabled and no model is currently loaded
+    global last_generation_time
+    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
+        shared.model, shared.tokenizer = load_model(shared.model_name)  # NOTE(review): generate_reply calls this before taking generation_lock; concurrent callers may both load - confirm
+        last_generation_time = time.time()  # restart the idle clock that unload_model_if_idle() compares against
 
 
 def load_model(model_name, loader=None):
@@ -158,7 +168,10 @@ def unload_model_if_idle():
     while True:
         shared.generation_lock.acquire()
         try:
-            if time.time() - last_generation_time > shared.args.idle_timeout * 60:
+            with _generation_count_lock:
+                is_active = active_generation_count > 0
+
+            if not is_active and time.time() - last_generation_time > shared.args.idle_timeout * 60:
                 if shared.model is not None:
                     logger.info("Unloading the model for inactivity.")
                     unload_model(keep_model_name=True)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f77be124..3a9ddab5 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -17,9 +17,7 @@ from modules.utils import check_model_loaded
 
 
 def generate_reply(*args, **kwargs):
-    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
-        from modules.models import load_model
-        shared.model, shared.tokenizer = load_model(shared.model_name)
+    models.load_model_if_idle_unloaded()
 
     state = args[1] if len(args) > 1 else kwargs.get('state', {})
     use_parallel = (
@@ -31,10 +29,16 @@ def generate_reply(*args, **kwargs):
     if not use_parallel:
         shared.generation_lock.acquire()
 
+    with models._generation_count_lock:
+        models.active_generation_count += 1
+
     try:
         for result in _generate_reply(*args, **kwargs):
             yield result
     finally:
+        with models._generation_count_lock:
+            models.active_generation_count -= 1
+
         models.last_generation_time = time.time()
         if not use_parallel:
             shared.generation_lock.release()
@@ -126,7 +130,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
 
 def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
     if shared.tokenizer is None:
-        raise ValueError('No tokenizer is loaded')
+        models.load_model_if_idle_unloaded()
+        if shared.tokenizer is None:
+            raise ValueError('No tokenizer is loaded')
 
     # llama.cpp case
     if shared.model.__class__.__name__ == 'LlamaServer':
@@ -176,7 +182,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
 
 def decode(output_ids, skip_special_tokens=True):
     if shared.tokenizer is None:
-        raise ValueError('No tokenizer is loaded')
+        models.load_model_if_idle_unloaded()
+        if shared.tokenizer is None:
+            raise ValueError('No tokenizer is loaded')
 
     return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens)
 

From e1541400219043f9b9cebf5f002b48251efc8bf9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 25 Mar 2026 07:21:02 -0700
Subject: [PATCH 04/76] Rename "truncation length" to "context length" in logs

---
 modules/api/models.py | 2 +-
 modules/models.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/api/models.py b/modules/api/models.py
index c879a860..b89397d3 100644
--- a/modules/api/models.py
+++ b/modules/api/models.py
@@ -68,7 +68,7 @@ def _load_model(data):
             if k in shared.settings:
                 shared.settings[k] = settings[k]
                 if k == 'truncation_length':
-                    logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}")
+                    logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}")
                 elif k == 'instruction_template':
                     logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}")
 
diff --git a/modules/models.py b/modules/models.py
index 61ca3838..e997d2d8 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -76,7 +76,7 @@ def load_model(model_name, loader=None):
 
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
-    logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
+    logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}")
     return model, tokenizer
 
 

From 4cbea02ed4e0dee2efd066ac48bcdf33631b9eca Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 26 Mar 2026 06:49:39 -0700
Subject: [PATCH 05/76] Add ik_llama.cpp support via `--ik` flag

---
 modules/llama_cpp_server.py | 37 +++++++++++++++++++++++++++++++++++++
 modules/shared.py           |  1 +
 2 files changed, 38 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2ae01ddc..9b9756a9 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -470,6 +470,10 @@ class LlamaServer:
                         else:
                             cmd.append(f"--{flag_item}")
 
+        # Patch flags for ik_llama.cpp compatibility
+        if shared.args.ik:
+            cmd = _patch_cmd_for_ik(cmd)
+
         env = os.environ.copy()
         if os.name == 'posix':
             current_path = env.get('LD_LIBRARY_PATH', '')
@@ -607,3 +611,36 @@ def filter_stderr_with_progress(process_stderr):
             process_stderr.close()
         except Exception:
             pass
+
+
+def _patch_cmd_for_ik(cmd):
+    """
+    Return a new argument list built from `cmd` with upstream llama.cpp flags rewritten for ik_llama.cpp:
+      --no-webui          → --webui none
+      --fit off           → (removed);  --fit on → --fit (bare flag)
+      --fit-ctx <value>   → (removed together with its value)
+      --fit-target        → --fit-margin (the value that follows is kept)
+    """
+    patched = []
+    i = 0
+    while i < len(cmd):  # index loop because some branches consume two elements
+        arg = cmd[i]
+
+        if arg == "--no-webui":
+            patched += ["--webui", "none"]
+        elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"):
+            val = cmd[i + 1]
+            i += 1  # consume the on/off value
+            if val == "on":
+                patched.append("--fit")
+            # "off" → drop entirely
+        elif arg == "--fit-ctx":
+            i += 1  # skip the value
+        elif arg == "--fit-target":
+            patched.append("--fit-margin")  # its value is the next element and is copied by the else branch
+        else:
+            patched.append(arg)  # anything else (including a "--fit" not followed by on/off) passes through unchanged
+
+        i += 1
+
+    return patched
diff --git a/modules/shared.py b/modules/shared.py
index acb103b4..c50736d7 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,6 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside <venv>/lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')

From bda95172bd6abecba165fc118f140cfc446f3c42 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 06:09:53 -0700
Subject: [PATCH 06/76] Fix stopping string detection for chromadb/context-1

---
 modules/chat.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index f8088e0f..edda11b0 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -671,7 +671,10 @@ def get_stopping_strings(state):
     # Handle GPT-OSS as a special case
     if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
         result.remove("<|end|>")
-        result.append("<|result|>")
+        if '<|result|>' in state['instruction_template_str']:
+            result.append("<|result|>")
+        elif '<|return|>' in state['instruction_template_str']:
+            result.append("<|return|>")
         result = list(set(result))
 
     if shared.args.verbose:

From 9dd04b86ce407507bcaf0862b97aadc64b6e62a6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 06:17:57 -0700
Subject: [PATCH 07/76] Suppress EOS token at logit level for ExLlamav3 when
 ban_eos_token is set

---
 modules/exllamav3.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 75c76c7c..f873503a 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -423,6 +423,15 @@ class Exllamav3Model:
         if logit_bias:
             filters.append(LogitBiasFilter(self.tokenizer, logit_bias))
 
+        # Suppress EOS tokens via logit bias so they are never sampled
+        if state['ban_eos_token']:
+            eos_bias = {}
+            for eos_id in self.config.eos_token_id_list:
+                if eos_id is not None:
+                    eos_bias[str(eos_id)] = float('-inf')
+            if eos_bias:
+                filters.append(LogitBiasFilter(self.tokenizer, eos_bias))
+
         # Logprobs support (OpenAI API)
         logprobs = state.get('logprobs', 0) or 0
         return_top_tokens = logprobs if logprobs > 0 else 0

From 4979e87e48c78d5e3186e4d9b2fbc8b30e86164f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 11:49:47 -0300
Subject: [PATCH 08/76] Add ik_llama.cpp support via ik_llama_cpp_binaries
 package

---
 .github/workflows/build-everything-tgw.yml    |  35 +++
 .../build-portable-release-ik-cuda.yml        | 179 +++++++++++++++
 .../workflows/build-portable-release-ik.yml   | 205 ++++++++++++++++++
 modules/llama_cpp_server.py                   |  21 +-
 modules/loaders.py                            |   2 +
 modules/shared.py                             |   2 +-
 modules/ui_model_menu.py                      |   3 +
 requirements/full/requirements.txt            |   6 +-
 requirements/full/requirements_amd.txt        |   4 +-
 .../full/requirements_apple_intel.txt         |   3 +-
 .../full/requirements_apple_silicon.txt       |   3 +-
 requirements/full/requirements_cpu_only.txt   |   6 +-
 requirements/portable/requirements.txt        |   4 +-
 requirements/portable/requirements_amd.txt    |   4 +-
 .../portable/requirements_apple_intel.txt     |   2 +-
 .../portable/requirements_apple_silicon.txt   |   2 +-
 .../portable/requirements_cpu_only.txt        |   4 +-
 .../portable/requirements_cuda131.txt         |   4 +-
 requirements/portable/requirements_vulkan.txt |   4 +-
 19 files changed, 469 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/build-portable-release-ik-cuda.yml
 create mode 100644 .github/workflows/build-portable-release-ik.yml

diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 9322f859..4de591f4 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -68,3 +68,38 @@ jobs:
     with:
       version: ${{ inputs.version }}
       config: 'os:macos-15-intel,macos-14'
+
+  build_release_ik_cuda_windows:
+    name: ik CUDA Windows
+    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:windows-2022'
+
+  build_release_ik_cuda_linux:
+    name: ik CUDA Linux
+    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:ubuntu-22.04'
+
+  build_release_ik_cpu_windows:
+    name: ik CPU Windows
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:windows-2022'
+
+  build_release_ik_cpu_linux:
+    name: ik CPU Linux
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:ubuntu-22.04'
+
+  build_release_ik_macos:
+    name: ik macOS
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:macos-14'
diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml
new file mode 100644
index 00000000..40b4b92f
--- /dev/null
+++ b/.github/workflows/build-portable-release-ik-cuda.yml
@@ -0,0 +1,179 @@
+name: Build ik CUDA
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+  workflow_call:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+
+permissions:
+  contents: write
+
+jobs:
+  define_matrix:
+    name: Define Build Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      CONFIGIN: ${{ inputs.config }}
+      EXCLUDEIN: ${{ inputs.exclude }}
+
+    steps:
+      - name: Define Job Output
+        id: set-matrix
+        run: |
+          $matrix = @{
+              'os' = @('ubuntu-22.04', 'windows-2022')
+              'pyver' = @("3.13")
+              'cuda' = @("12.4", "13.1")
+          }
+
+          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
+
+          if ($env:EXCLUDEIN -ne 'None') {
+              $exclusions = @()
+              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
+              $matrix['exclude'] = $exclusions
+          }
+
+          $matrixOut = ConvertTo-Json $matrix -Compress
+          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+  build_wheels:
+    name: ${{ matrix.os }} ${{ matrix.pyver }} CUDA ${{ matrix.cuda }}
+    needs: define_matrix
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      PCKGVER: ${{ inputs.version }}
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          repository: 'oobabooga/text-generation-webui'
+          ref: ${{ inputs.version }}
+          submodules: 'recursive'
+
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      - name: Build Package
+        shell: bash
+        run: |
+            VERSION_CLEAN="$PCKGVER"  # job-level env; avoids interpolating the workflow input into the script text
+            VERSION_CLEAN="${VERSION_CLEAN#v}"
+            cd ..
+            cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # Remove extensions that need additional requirements
+            allowed=("character_bias" "gallery" "sd_api_pictures")
+            find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
+
+            # Define common variables
+            CUDA_VERSION="${{ matrix.cuda }}"
+            VERSION="$PCKGVER"
+
+            # 1. Set platform-specific variables
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                PLATFORM="windows"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/python.exe -m pip"
+                PACKAGES_PATH="portable_env/Lib/site-packages"
+                rm start_linux.sh start_macos.sh
+            else
+                PLATFORM="linux"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_macos.sh start_windows.bat
+            fi
+
+            # 2. Download and extract Python
+            cd ..
+            echo "Downloading Python for $PLATFORM..."
+            curl -L -o python-build.tar.gz "$PYTHON_URL"
+            tar -xzf python-build.tar.gz
+            mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+
+            # 3. Prepare requirements file based on CUDA version
+            cd "text-generation-webui-${VERSION_CLEAN}"
+            if [[ "$CUDA_VERSION" == "13.1" ]]; then
+                REQ_FILE="requirements/portable/requirements_cuda131.txt"
+            else
+                REQ_FILE="requirements/portable/requirements.txt"
+            fi
+
+            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
+            sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true
+
+            # 5. Install packages
+            echo "Installing Python packages from $REQ_FILE..."
+            $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
+
+            # 6. Clean up
+            rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
+
+            # 7. Create archive
+            cd ..
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+            else
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+            fi
+
+      - name: Upload files to a GitHub release
+        id: upload-release
+        uses: svenstaro/upload-release-action@2.7.0
+        continue-on-error: true
+        with:
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+          file: ../textgen-portable-ik-*
+          tag: ${{ inputs.version }}
+          file_glob: true
+          make_latest: false
+          overwrite: true
diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml
new file mode 100644
index 00000000..afb2e763
--- /dev/null
+++ b/.github/workflows/build-portable-release-ik.yml
@@ -0,0 +1,205 @@
+name: Build ik CPU and macOS
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+  workflow_call:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+
+permissions:
+  contents: write
+
+jobs:
+  define_matrix:
+    name: Define Build Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      CONFIGIN: ${{ inputs.config }}
+      EXCLUDEIN: ${{ inputs.exclude }}
+
+    steps:
+      - name: Define Job Output
+        id: set-matrix
+        run: |
+          $matrix = @{
+              'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
+              'pyver' = @("3.13")
+          }
+
+          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
+
+          if ($env:EXCLUDEIN -ne 'None') {
+              $exclusions = @()
+              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
+              $matrix['exclude'] = $exclusions
+          }
+
+          $matrixOut = ConvertTo-Json $matrix -Compress
+          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+  build_wheels:
+    name: ${{ matrix.os }} ${{ matrix.pyver }}
+    needs: define_matrix
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      PCKGVER: ${{ inputs.version }}
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          repository: 'oobabooga/text-generation-webui'
+          ref: ${{ inputs.version }}
+          submodules: 'recursive'
+
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      - name: Build Package
+        shell: bash
+        run: |
+            VERSION_CLEAN="${{ inputs.version }}"
+            VERSION_CLEAN="${VERSION_CLEAN#v}"
+            cd ..
+            cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # Remove extensions that need additional requirements
+            allowed=("character_bias" "gallery" "sd_api_pictures")
+            find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
+
+            # Define common variables
+            VERSION="${{ inputs.version }}"
+            OS_TYPE="${{ matrix.os }}"
+
+            # 1. Set platform-specific variables
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                PLATFORM="windows-cpu"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/python.exe -m pip"
+                PACKAGES_PATH="portable_env/Lib/site-packages"
+                rm start_linux.sh start_macos.sh
+            elif [[ "$RUNNER_OS" == "macOS" ]]; then
+                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
+                    PLATFORM="macos-x86_64"
+                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz"
+                    REQ_TYPE="apple_intel"
+                else
+                    PLATFORM="macos-arm64"
+                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz"
+                    REQ_TYPE="apple_silicon"
+                fi
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_linux.sh start_windows.bat
+            else
+                # Linux case (fallback for any runner that is neither Windows nor macOS)
+                PLATFORM="linux-cpu"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_macos.sh start_windows.bat
+            fi
+
+            # 2. Download and extract Python
+            echo "Downloading Python for $PLATFORM..."
+            cd ..
+            curl -L -o python-build.tar.gz "$PYTHON_URL"
+            tar -xzf python-build.tar.gz
+            mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+
+            # 3. Prepare requirements file based on platform
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # macOS picks the Intel or Apple Silicon list; Linux and Windows share the CPU-only list
+            if [[ "$RUNNER_OS" == "macOS" ]]; then
+                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
+                    REQ_FILE="requirements/portable/requirements_apple_intel.txt"
+                else
+                    REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
+                fi
+            else
+                REQ_FILE="requirements/portable/requirements_cpu_only.txt"
+            fi
+
+            echo "Using requirements file: $REQ_FILE"
+
+            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts (BSD sed on macOS needs an explicit empty -i suffix)
+            if [[ "$RUNNER_OS" == "macOS" ]]; then
+                sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+                sed -i '' 's/--portable/--portable --ik/g' start_macos.sh
+            else
+                sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+                sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
+            fi
+
+            # 5. Install packages
+            echo "Installing Python packages from $REQ_FILE..."
+            $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
+
+            # 6. Clean up
+            rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
+
+            # 7. Create archive
+            cd ..
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+            else
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+            fi
+
+      - name: Upload files to a GitHub release
+        id: upload-release
+        uses: svenstaro/upload-release-action@2.7.0
+        continue-on-error: true
+        with:
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+          file: ../textgen-portable-ik-*
+          tag: ${{ inputs.version }}
+          file_glob: true
+          make_latest: false
+          overwrite: true
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 9b9756a9..5e2decfa 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -11,7 +11,6 @@ import time
 from pathlib import Path
 from typing import Any, List
 
-import llama_cpp_binaries
 import requests
 
 from modules import shared
@@ -357,7 +356,16 @@ class LlamaServer:
         """Start the llama.cpp server and wait until it's ready."""
         # Determine the server path
         if self.server_path is None:
-            self.server_path = llama_cpp_binaries.get_binary_path()
+            if shared.args.ik:
+                try:
+                    import ik_llama_cpp_binaries
+                except ImportError:
+                    raise ImportError("--ik requires the ik_llama_cpp_binaries package. Install it with: pip install <ik_llama_cpp_binaries wheel URL>")
+
+                self.server_path = ik_llama_cpp_binaries.get_binary_path()
+            else:
+                import llama_cpp_binaries
+                self.server_path = llama_cpp_binaries.get_binary_path()
 
         # Build the command
         cmd = [
@@ -616,10 +624,12 @@ def filter_stderr_with_progress(process_stderr):
 def _patch_cmd_for_ik(cmd):
     """
     Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents:
-      --no-webui          → --webui none
+      --no-webui           → --webui none
       --fit off            → (removed)
       --fit on / --fit-ctx → --fit (bare flag)
       --fit-target         → --fit-margin
+      --cache-reuse        → (removed, unsupported)
+      --swa-full           → (removed, unsupported)
     """
     patched = []
     i = 0
@@ -635,9 +645,14 @@ def _patch_cmd_for_ik(cmd):
                 patched.append("--fit")
             # "off" → drop entirely
         elif arg == "--fit-ctx":
+            patched.append("--fit")
             i += 1  # skip the value
         elif arg == "--fit-target":
             patched.append("--fit-margin")
+        elif arg == "--cache-reuse":
+            i += 1  # skip the value
+        elif arg == "--swa-full":
+            pass  # bare flag, just drop it
         else:
             patched.append(arg)
 
diff --git a/modules/loaders.py b/modules/loaders.py
index c90f2ebb..cb1f3d3b 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({
         'no_mmap',
         'mlock',
         'numa',
+        'ik',
         'parallel',
         'model_draft',
         'draft_max',
@@ -345,6 +346,7 @@ def list_model_elements():
         'spec_ngram_size_m',
         'spec_ngram_min_hits',
         'mmproj',
+        'ik',
     ]
 
 
diff --git a/modules/shared.py b/modules/shared.py
index c50736d7..13843f0c 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,7 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
-group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside <venv>/lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. Requires the ik_llama_cpp_binaries package to be installed.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 5b7621a7..16505afa 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -51,6 +51,9 @@ def create_ui():
 
                         with gr.Column():
                             shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
+                            if not shared.args.portable:
+                                shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.')
+
                             shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
                             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 56619627..100c99d1 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 620683cc..66fa4ac7 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b1f109b2..98dc8be6 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index a54476a9..e33264cf 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index be82c904..cd083f6d 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 188da380..67182225 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 4562b6d0..5f5b2f8d 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 04dcf25e..f5f7d6ee 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 4b8af78a..e51fc296 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 5b0eaf89..683f94c8 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 90b3234f..942d0877 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index ea72b4ec..ae784e00 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From be6fc0663ac1b7a60b7fde24afb38de2b0aba57b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 08:11:28 -0700
Subject: [PATCH 09/76] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 14 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 100c99d1..6e11dd2f 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 66fa4ac7..c964eff6 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 98dc8be6..b1dd6a4f 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index e33264cf..4d03d280 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index cd083f6d..9d41d069 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 77c254e6..052085cc 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 67182225..ff80b6c8 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5f5b2f8d..318044da 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index f5f7d6ee..1676bffb 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index e51fc296..27fc2da8 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 683f94c8..0bbdd30a 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 942d0877..c3ae3c57 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index e8457909..e38140ce 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index ae784e00..e646c04c 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 0466b6e2714a05c04eff0c929f15e4679f029e8d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 29 Mar 2026 15:52:36 -0700
Subject: [PATCH 10/76] ik_llama.cpp: Auto-enable Hadamard KV cache rotation
 with quantized cache

---
 modules/llama_cpp_server.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 5e2decfa..fa968be1 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -631,6 +631,12 @@ def _patch_cmd_for_ik(cmd):
       --cache-reuse        → (removed, unsupported)
       --swa-full           → (removed, unsupported)
     """
+    # Add Hadamard KV cache rotation when using quantized cache types.
+    # This significantly improves quantized cache quality (especially q4_0)
+    # and is a no-op for MLA models like DeepSeek.
+    if shared.args.cache_type in ("q8_0", "q4_0"):
+        cmd += ["-khad", "-vhad"]
+
     patched = []
     i = 0
     while i < len(cmd):

From 6382fbef8381bf60ff909b4fd76e7c1f4c063afc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 30 Mar 2026 17:44:19 -0700
Subject: [PATCH 11/76] Several small code simplifications

---
 download-model.py        |  25 +++---
 js/dark_theme.js         |  12 ++-
 js/global_scope_js.js    |  79 +++++++++---------
 js/main.js               | 171 +++++++++++++--------------------------
 js/save_files.js         |  18 ++---
 js/show_controls.js      |  21 ++---
 js/switch_tabs.js        |  24 ++----
 js/update_big_picture.js |   3 +-
 modules/extensions.py    |  22 +++--
 9 files changed, 140 insertions(+), 235 deletions(-)

diff --git a/download-model.py b/download-model.py
index 95d25e16..a31bbfc6 100644
--- a/download-model.py
+++ b/download-model.py
@@ -158,28 +158,21 @@ class ModelDownloader:
         # Also if GGUF and safetensors are available, download only safetensors
         if (has_pytorch or has_pt or has_gguf) and has_safetensors:
             has_gguf = False
-            for i in range(len(classifications) - 1, -1, -1):
-                if classifications[i] in ['pytorch', 'pt', 'gguf']:
-                    links.pop(i)
-                    file_sizes.pop(i)
+            keep = [i for i, c in enumerate(classifications) if c not in ['pytorch', 'pt', 'gguf']]
+            links = [links[i] for i in keep]
+            file_sizes = [file_sizes[i] for i in keep]
 
         # For GGUF, try to download only the Q4_K_M if no specific file is specified.
         if has_gguf and specific_file is None:
-            has_q4km = False
-            for i in range(len(classifications) - 1, -1, -1):
-                if 'q4_k_m' in links[i].lower():
-                    has_q4km = True
+            has_q4km = any('q4_k_m' in link.lower() for link in links)
 
             if has_q4km:
-                for i in range(len(classifications) - 1, -1, -1):
-                    if 'q4_k_m' not in links[i].lower():
-                        links.pop(i)
-                        file_sizes.pop(i)
+                keep = [i for i, link in enumerate(links) if 'q4_k_m' in link.lower()]
             else:
-                for i in range(len(classifications) - 1, -1, -1):
-                    if links[i].lower().endswith('.gguf'):
-                        links.pop(i)
-                        file_sizes.pop(i)
+                keep = [i for i, link in enumerate(links) if not link.lower().endswith('.gguf')]
+
+            links = [links[i] for i in keep]
+            file_sizes = [file_sizes[i] for i in keep]
 
         is_llamacpp = has_gguf and specific_file is not None
         return links, sha256, is_lora, is_llamacpp, file_sizes
diff --git a/js/dark_theme.js b/js/dark_theme.js
index 7136f5bf..9d7069e2 100644
--- a/js/dark_theme.js
+++ b/js/dark_theme.js
@@ -1,6 +1,6 @@
 function toggleDarkMode() {
   document.body.classList.toggle("dark");
-  var currentCSS = document.getElementById("highlight-css");
+  const currentCSS = document.getElementById("highlight-css");
   if (currentCSS.getAttribute("href") === "file/css/highlightjs/github-dark.min.css") {
     currentCSS.setAttribute("href", "file/css/highlightjs/github.min.css");
   } else {
@@ -9,12 +9,10 @@ function toggleDarkMode() {
 
   // Re-highlight all code blocks once stylesheet loads
   currentCSS.onload = function() {
-    const messageBodies = document.getElementById("chat").querySelectorAll(".message-body");
-    messageBodies.forEach((messageBody) => {
-      const codeBlocks = messageBody.querySelectorAll("pre code");
-      codeBlocks.forEach((codeBlock) => {
-        hljs.highlightElement(codeBlock);
-      });
+    // Clear data-highlighted so hljs will re-process with the new theme
+    document.querySelectorAll("#chat .message-body pre code[data-highlighted]").forEach((codeBlock) => {
+      delete codeBlock.dataset.highlighted;
     });
+    doSyntaxHighlighting();
   };
 }
diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 92f65622..20eeef66 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -1,11 +1,35 @@
+// -------------------------------------------------
+// Shared helpers
+// -------------------------------------------------
+
+function getProfilePictureUrl() {
+  return "/file/user_data/cache/pfp_character.png?time=" + Date.now();
+}
+
+const MESSAGE_SELECTOR = ".message, .user-message, .assistant-message";
+
+function getMessageElement(element) {
+  if (!element) return null;
+  return element.closest(MESSAGE_SELECTOR);
+}
+
+function isUserRole(messageElement) {
+  return messageElement.classList.contains("user-message") ||
+         messageElement.querySelector(".text-you") !== null ||
+         messageElement.querySelector(".circle-you") !== null;
+}
+
+// Trigger a synthetic 'input' event so Gradio picks up programmatic value changes
+function dispatchGradioInput(element) {
+  element.dispatchEvent(new Event("input", { bubbles: true }));
+}
+
 // -------------------------------------------------
 // Event handlers
 // -------------------------------------------------
 
 function copyToClipboard(element) {
-  if (!element) return;
-
-  const messageElement = element.closest(".message, .user-message, .assistant-message");
+  const messageElement = getMessageElement(element);
   if (!messageElement) return;
 
   const rawText = messageElement.getAttribute("data-raw");
@@ -48,9 +72,7 @@ function fallbackCopyToClipboard(text) {
 }
 
 function branchHere(element) {
-  if (!element) return;
-
-  const messageElement = element.closest(".message, .user-message, .assistant-message");
+  const messageElement = getMessageElement(element);
   if (!messageElement) return;
 
   const index = messageElement.getAttribute("data-index");
@@ -69,11 +91,7 @@ function branchHere(element) {
   }
 
   branchIndexInput.value = index;
-
-  // Trigger any 'change' or 'input' events Gradio might be listening for
-  const event = new Event("input", { bubbles: true });
-  branchIndexInput.dispatchEvent(event);
-
+  dispatchGradioInput(branchIndexInput);
   branchButton.click();
 }
 
@@ -82,9 +100,7 @@ function branchHere(element) {
 // -------------------------------------------------
 
 function editHere(buttonElement) {
-  if (!buttonElement) return;
-
-  const messageElement = buttonElement.closest(".message, .user-message, .assistant-message");
+  const messageElement = getMessageElement(buttonElement);
   if (!messageElement) return;
 
   const messageBody = messageElement.querySelector(".message-body");
@@ -97,12 +113,7 @@ function editHere(buttonElement) {
     return;
   }
 
-  // Determine role based on message element - handle different chat modes
-  const isUserMessage = messageElement.classList.contains("user-message") ||
-                       messageElement.querySelector(".text-you") !== null ||
-                       messageElement.querySelector(".circle-you") !== null;
-
-  startEditing(messageElement, messageBody, isUserMessage);
+  startEditing(messageElement, messageBody, isUserRole(messageElement));
 }
 
 function startEditing(messageElement, messageBody, isUserMessage) {
@@ -209,30 +220,22 @@ function submitMessageEdit(index, newText, isUserMessage) {
   editTextInput.value = newText;
   editRoleInput.value = isUserMessage ? "user" : "assistant";
 
-  editIndexInput.dispatchEvent(new Event("input", { bubbles: true }));
-  editTextInput.dispatchEvent(new Event("input", { bubbles: true }));
-  editRoleInput.dispatchEvent(new Event("input", { bubbles: true }));
+  dispatchGradioInput(editIndexInput);
+  dispatchGradioInput(editTextInput);
+  dispatchGradioInput(editRoleInput);
 
   editButton.click();
   return true;
 }
 
 function navigateVersion(element, direction) {
-  if (!element) return;
-
-  const messageElement = element.closest(".message, .user-message, .assistant-message");
+  const messageElement = getMessageElement(element);
   if (!messageElement) return;
 
   const index = messageElement.getAttribute("data-index");
   if (!index) return;
 
-  // Determine role based on message element classes
-  let role = "assistant"; // Default role
-  if (messageElement.classList.contains("user-message") ||
-      messageElement.querySelector(".text-you") ||
-      messageElement.querySelector(".circle-you")) {
-    role = "user";
-  }
+  const role = isUserRole(messageElement) ? "user" : "assistant";
 
   const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input");
   const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea");
@@ -248,11 +251,9 @@ function navigateVersion(element, direction) {
   directionInput.value = direction;
   roleInput.value = role;
 
-  // Trigger 'input' events for Gradio to pick up changes
-  const event = new Event("input", { bubbles: true });
-  indexInput.dispatchEvent(event);
-  directionInput.dispatchEvent(event);
-  roleInput.dispatchEvent(event);
+  dispatchGradioInput(indexInput);
+  dispatchGradioInput(directionInput);
+  dispatchGradioInput(roleInput);
 
   navigateButton.click();
 }
@@ -313,7 +314,7 @@ function handleMorphdomUpdate(data) {
 
 function applyMorphdomUpdate(data) {
   // Determine target element and use it as query scope
-  var target_element, target_html;
+  let target_element, target_html;
   if (data.last_message_only) {
     const childNodes = document.getElementsByClassName("messages")[0].childNodes;
     target_element = childNodes[childNodes.length - 1];
diff --git a/js/main.js b/js/main.js
index f05f93c6..cba4c903 100644
--- a/js/main.js
+++ b/js/main.js
@@ -4,8 +4,9 @@
 
 // Sync highlight.js theme with the actual Gradio theme
 var defined_hljs_css = document.body.classList.contains("dark") ? "file/css/highlightjs/github-dark.min.css" : "file/css/highlightjs/github.min.css";
-if (document.getElementById("highlight-css").getAttribute("href") !== defined_hljs_css) {
-  document.getElementById("highlight-css").setAttribute("href", defined_hljs_css);
+var hljsCssElement = document.getElementById("highlight-css");
+if (hljsCssElement.getAttribute("href") !== defined_hljs_css) {
+  hljsCssElement.setAttribute("href", defined_hljs_css);
 }
 
 let main_parent = document.getElementById("chat-tab").parentNode;
@@ -49,21 +50,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event)
 //------------------------------------------------
 
 // --- Helper functions --- //
-function isModifiedKeyboardEvent() {
-  return (event instanceof KeyboardEvent &&
-    event.shiftKey ||
-    event.ctrlKey ||
-    event.altKey ||
-    event.metaKey);
+function isModifiedKeyboardEvent(event) {
+  return event instanceof KeyboardEvent &&
+    (event.shiftKey || event.ctrlKey || event.altKey || event.metaKey);
 }
 
-function isFocusedOnEditableTextbox() {
+function isFocusedOnEditableTextbox(event) {
   if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") {
     return !!event.target.value;
   }
+  return false;
 }
 
-let previousTabId = "chat-tab-button";
 document.addEventListener("keydown", function(event) {
   // Stop generation on Esc pressed
   if (event.key === "Escape") {
@@ -117,14 +115,14 @@ document.addEventListener("keydown", function(event) {
   }
 
   // --- Simple version navigation --- //
-  if (!isFocusedOnEditableTextbox()) {
+  if (!isFocusedOnEditableTextbox(event)) {
     // Version navigation on Arrow keys (horizontal)
-    if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") {
+    if (!isModifiedKeyboardEvent(event) && event.key === "ArrowLeft") {
       event.preventDefault();
       navigateLastAssistantMessage("left");
     }
 
-    else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") {
+    else if (!isModifiedKeyboardEvent(event) && event.key === "ArrowRight") {
       event.preventDefault();
       if (!navigateLastAssistantMessage("right")) {
         // If can't navigate right (last version), regenerate
@@ -159,9 +157,8 @@ targetElement.addEventListener("scroll", function() {
   let diff = targetElement.scrollHeight - targetElement.clientHeight;
   let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0;
 
-  // Add scrolling class to disable hover effects
   if (window.isScrolled || !isAtBottomNow) {
-    targetElement.classList.add("scrolling");
+    targetElement.classList.add("scrolling"); // Disables hover effects during scroll
   }
 
   if(isAtBottomNow) {
@@ -202,12 +199,8 @@ const observer = new MutationObserver(function() {
 });
 
 // Only watch for attribute changes on targetElement (e.g. _generating class)
-const config = {
-  attributes: true
-};
-
 // Start observing the target element
-observer.observe(targetElement, config);
+observer.observe(targetElement, { attributes: true });
 
 //------------------------------------------------
 // Handle syntax highlighting / LaTeX
@@ -228,7 +221,7 @@ window.doSyntaxHighlighting = function() {
   if (messageBodies.length > 0) {
     let hasSeenVisible = false;
 
-    // Go from last message to first
+    // Go from last message to first so we can early-exit once past visible area
     for (let i = messageBodies.length - 1; i >= 0; i--) {
       const messageBody = messageBodies[i];
 
@@ -243,8 +236,8 @@ window.doSyntaxHighlighting = function() {
           codeBlock.classList.add("pretty_scrollbar");
         });
 
-        // Only render math in visible elements
         const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
+        // Only render math in individually visible containers (the outer check is on the message body)
         mathContainers.forEach(container => {
           if (isElementVisibleOnScreen(container)) {
             renderMathInElement(container, {
@@ -271,7 +264,7 @@ const doSyntaxHighlighting = window.doSyntaxHighlighting;
 // Add some scrollbars
 //------------------------------------------------
 const scrollbarElements = document.querySelectorAll(".add_scrollbar textarea, .add_scrollbar .drag-drop-list");
-for(i = 0; i < scrollbarElements.length; i++) {
+for(let i = 0; i < scrollbarElements.length; i++) {
   scrollbarElements[i].classList.remove("scroll-hide");
   scrollbarElements[i].classList.add("pretty_scrollbar");
   scrollbarElements[i].style.resize = "none";
@@ -298,13 +291,13 @@ if (toolsInfo) {
 // Remove some backgrounds
 //------------------------------------------------
 const noBackgroundelements = document.querySelectorAll(".no-background");
-for(i = 0; i < noBackgroundelements.length; i++) {
+for(let i = 0; i < noBackgroundelements.length; i++) {
   noBackgroundelements[i].parentNode.style.border = "none";
   noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center";
 }
 
 const slimDropdownElements = document.querySelectorAll(".slim-dropdown");
-for (i = 0; i < slimDropdownElements.length; i++) {
+for (let i = 0; i < slimDropdownElements.length; i++) {
   const parentNode = slimDropdownElements[i].parentNode;
   parentNode.style.background = "transparent";
   parentNode.style.border = "0";
@@ -374,49 +367,43 @@ button.addEventListener("click", function () {
   }
 });
 
-// Add event listener for mouseleave on the button
-button.addEventListener("mouseleave", function () {
-  // Delay to prevent menu hiding when the mouse leaves the button into the menu
+// Delay to prevent menu hiding when the mouse leaves the button or menu
+function delayedHideMenu() {
   setTimeout(function () {
     if (!isMouseOverButtonOrMenu()) {
       hideMenu();
     }
   }, 100);
-});
+}
 
+// Add event listener for mouseleave on the button
+button.addEventListener("mouseleave", delayedHideMenu);
 // Add event listener for mouseleave on the menu
-menu.addEventListener("mouseleave", function () {
-  // Delay to prevent menu hide when the mouse leaves the menu into the button
-  setTimeout(function () {
-    if (!isMouseOverButtonOrMenu()) {
-      hideMenu();
-    }
-  }, 100);
-});
+menu.addEventListener("mouseleave", delayedHideMenu);
 
 // Add event listener for click anywhere in the document
 document.addEventListener("click", function (event) {
-  const target = event.target;
-
   // Check if the click is outside the button/menu and the menu is visible
   if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") {
     hideMenu();
   }
 
-  if (event.target.classList.contains("pfp_character")) {
+  const target = event.target;
+
+  if (target.classList.contains("pfp_character")) {
     toggleBigPicture();
   }
 
   // Handle sidebar clicks on mobile
   if (isMobile()) {
-  // Check if the click did NOT originate from any of the specified toggle buttons or elements
+    // Check if the click did NOT originate from any of the specified toggle buttons or elements
     if (
       target.closest("#navigation-toggle") !== navigationToggle &&
-    target.closest("#past-chats-toggle") !== pastChatsToggle &&
-    target.closest("#chat-controls-toggle") !== chatControlsToggle &&
-    target.closest(".header_bar") !== headerBar &&
-    target.closest("#past-chats-row") !== pastChatsRow &&
-    target.closest("#chat-controls") !== chatControlsRow
+      target.closest("#past-chats-toggle") !== pastChatsToggle &&
+      target.closest("#chat-controls-toggle") !== chatControlsToggle &&
+      target.closest(".header_bar") !== headerBar &&
+      target.closest("#past-chats-row") !== pastChatsRow &&
+      target.closest("#chat-controls") !== chatControlsRow
     ) {
       handleIndividualSidebarClose(event);
     }
@@ -433,27 +420,19 @@ document.getElementById("chat-input-row").classList.add("chat-input-positioned")
 //------------------------------------------------
 const chatTextArea = document.getElementById("chat-input").querySelector("textarea");
 
-function respondToChatInputVisibility(element, callback) {
-  var options = {
-    root: document.documentElement,
-  };
-
-  var observer = new IntersectionObserver((entries, observer) => {
+function focusOnVisible(element) {
+  var observer = new IntersectionObserver((entries) => {
     entries.forEach(entry => {
-      callback(entry.intersectionRatio > 0);
+      if (entry.intersectionRatio > 0) {
+        element.focus();
+      }
     });
-  }, options);
+  }, { root: document.documentElement });
 
   observer.observe(element);
 }
 
-function handleChatInputVisibilityChange(isVisible) {
-  if (isVisible) {
-    chatTextArea.focus();
-  }
-}
-
-respondToChatInputVisibility(chatTextArea, handleChatInputVisibilityChange);
+focusOnVisible(chatTextArea);
 
 //------------------------------------------------
 // Show enlarged character picture when the profile
@@ -463,8 +442,7 @@ let bigPictureVisible = false;
 
 function addBigPicture() {
   var imgElement = document.createElement("img");
-  var timestamp = new Date().getTime();
-  imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
+  imgElement.src = getProfilePictureUrl();
   imgElement.classList.add("bigProfilePicture");
   imgElement.addEventListener("load", function () {
     this.style.visibility = "visible";
@@ -478,9 +456,8 @@ function addBigPicture() {
 }
 
 function deleteBigPicture() {
-  var bigProfilePictures = document.querySelectorAll(".bigProfilePicture");
-  bigProfilePictures.forEach(function (element) {
-    element.parentNode.removeChild(element);
+  document.querySelectorAll(".bigProfilePicture").forEach(function (element) {
+    element.remove();
   });
 }
 
@@ -494,44 +471,11 @@ function toggleBigPicture() {
   }
 }
 
-//------------------------------------------------
-// Handle the chat input box growth
-//------------------------------------------------
-
-// Cache DOM elements
-const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
-const chatInput = document.querySelector("#chat-input textarea");
-
-// Variables to store current dimensions
-let currentChatInputHeight = chatInput.clientHeight;
-
 //------------------------------------------------
 // Focus on the rename text area when it becomes visible
 //------------------------------------------------
 const renameTextArea = document.getElementById("rename-row").querySelector("textarea");
-
-function respondToRenameVisibility(element, callback) {
-  var options = {
-    root: document.documentElement,
-  };
-
-  var observer = new IntersectionObserver((entries, observer) => {
-    entries.forEach(entry => {
-      callback(entry.intersectionRatio > 0);
-    });
-  }, options);
-
-  observer.observe(element);
-}
-
-
-function handleVisibilityChange(isVisible) {
-  if (isVisible) {
-    renameTextArea.focus();
-  }
-}
-
-respondToRenameVisibility(renameTextArea, handleVisibilityChange);
+focusOnVisible(renameTextArea);
 
 //------------------------------------------------
 // Adjust the chat tab margin if no extension UI
@@ -737,21 +681,21 @@ function handleIndividualSidebarClose(event) {
 
   // Close navigation bar if click is outside and it is open
   if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) {
-    toggleSidebar(headerBar, navigationToggle, true);
+    toggleSidebar(headerBar, navigationToggle);
   }
 
   // Close past chats row if click is outside and it is open
   if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) {
-    toggleSidebar(pastChatsRow, pastChatsToggle, true);
+    toggleSidebar(pastChatsRow, pastChatsToggle);
   }
 
   // Close chat controls row if click is outside and it is open
   if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) {
-    toggleSidebar(chatControlsRow, chatControlsToggle, true);
+    toggleSidebar(chatControlsRow, chatControlsToggle);
   }
 }
 
-function toggleSidebar(sidebar, toggle, forceClose = false) {
+function toggleSidebar(sidebar, toggle) {
   const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden");
   const shouldClose = !isCurrentlyHidden;
 
@@ -776,11 +720,6 @@ function toggleSidebar(sidebar, toggle, forceClose = false) {
     toggle.classList.toggle("chat-controls-open", !shouldClose);
     toggle.innerHTML = shouldClose ? leftArrowSVG : rightArrowSVG;
   }
-
-  // Mobile handling
-  if (isMobile()) {
-    sidebar.classList.toggle("sidebar-shown", !shouldClose);
-  }
 }
 
 // Function to check if the device is mobile
@@ -840,17 +779,17 @@ pastChatsToggle.addEventListener("click", () => {
   const isCurrentlyOpen = !pastChatsRow.classList.contains("sidebar-hidden");
   toggleSidebar(pastChatsRow, pastChatsToggle);
 
-  // On desktop, open/close both sidebars at the same time
+  // On desktop, sync both sidebars together
   if (!isMobile()) {
     if (isCurrentlyOpen) {
       // If we just closed the left sidebar, also close the right sidebar
       if (!chatControlsRow.classList.contains("sidebar-hidden")) {
-        toggleSidebar(chatControlsRow, chatControlsToggle, true);
+        toggleSidebar(chatControlsRow, chatControlsToggle);
       }
     } else {
       // If we just opened the left sidebar, also open the right sidebar
       if (chatControlsRow.classList.contains("sidebar-hidden")) {
-        toggleSidebar(chatControlsRow, chatControlsToggle, false);
+        toggleSidebar(chatControlsRow, chatControlsToggle);
       }
     }
   }
@@ -860,17 +799,17 @@ chatControlsToggle.addEventListener("click", () => {
   const isCurrentlyOpen = !chatControlsRow.classList.contains("sidebar-hidden");
   toggleSidebar(chatControlsRow, chatControlsToggle);
 
-  // On desktop, open/close both sidebars at the same time
+  // On desktop, sync both sidebars together
   if (!isMobile()) {
     if (isCurrentlyOpen) {
       // If we just closed the right sidebar, also close the left sidebar
       if (!pastChatsRow.classList.contains("sidebar-hidden")) {
-        toggleSidebar(pastChatsRow, pastChatsToggle, true);
+        toggleSidebar(pastChatsRow, pastChatsToggle);
       }
     } else {
       // If we just opened the right sidebar, also open the left sidebar
       if (pastChatsRow.classList.contains("sidebar-hidden")) {
-        toggleSidebar(pastChatsRow, pastChatsToggle, false);
+        toggleSidebar(pastChatsRow, pastChatsToggle);
       }
     }
   }
@@ -890,7 +829,7 @@ if (isMobile()) {
   const textarea = document.querySelector("#chat-input textarea");
 
   if (textarea) {
-    // Simulate adding and removing a newline
+    // Force textarea height recalculation by simulating content change
     textarea.value += "\n";
     textarea.dispatchEvent(new Event("input", { bubbles: true }));
     textarea.value = textarea.value.slice(0, -1);
diff --git a/js/save_files.js b/js/save_files.js
index bdb0e334..c3cbf9ff 100644
--- a/js/save_files.js
+++ b/js/save_files.js
@@ -1,10 +1,9 @@
 // Functions for downloading JSON files
 function getCurrentTimestamp() {
   const now = new Date();
-  const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds
+  const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert minutes to milliseconds
   const localTime = new Date(now.getTime() - timezoneOffset);
-  const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
-  return formattedTimestamp;
+  return localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
 }
 
 function saveFile(contents, filename) {
@@ -18,23 +17,18 @@ function saveFile(contents, filename) {
 }
 
 function saveHistory(history, character, mode) {
-  let path = null;
+  let path;
 
   if (["chat", "chat-instruct"].includes(mode) && character && character.trim() !== "") {
     path = `history_${character}_${getCurrentTimestamp()}.json`;
   } else {
-    try {
-      path = `history_${mode}_${getCurrentTimestamp()}.json`;
-    } catch (error) {
-      path = `history_${getCurrentTimestamp()}.json`;
-    }
+    path = `history_${mode || "unknown"}_${getCurrentTimestamp()}.json`;
   }
+
   saveFile(history, path);
 }
 
 function saveSession(session) {
-  let path = null;
-
-  path = `session_${getCurrentTimestamp()}.json`;
+  const path = `session_${getCurrentTimestamp()}.json`;
   saveFile(session, path);
 }
diff --git a/js/show_controls.js b/js/show_controls.js
index ff513395..d5642dc4 100644
--- a/js/show_controls.js
+++ b/js/show_controls.js
@@ -1,13 +1,11 @@
-const chatParent = document.querySelector(".chat-parent");
-
 function toggle_controls(value) {
+  const navToggle = document.getElementById("navigation-toggle");
+  const pastChatsToggle = document.getElementById("past-chats-toggle");
   const extensions = document.querySelector("#extensions");
+  const galleryExtension = document.getElementById("gallery-extension");
 
   if (value) {
     // SHOW MODE: Click toggles to show hidden sidebars
-    const navToggle = document.getElementById("navigation-toggle");
-    const pastChatsToggle = document.getElementById("past-chats-toggle");
-
     if (navToggle && document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
       navToggle.click();
     }
@@ -19,17 +17,11 @@ function toggle_controls(value) {
     if (extensions) {
       extensions.style.display = "inherit";
     }
-
-    let gallery_element = document.getElementById("gallery-extension");
-    if (gallery_element) {
-      gallery_element.style.display = "block";
+    if (galleryExtension) {
+      galleryExtension.style.display = "block";
     }
-
   } else {
     // HIDE MODE: Click toggles to hide visible sidebars
-    const navToggle = document.getElementById("navigation-toggle");
-    const pastChatsToggle = document.getElementById("past-chats-toggle");
-
     if (navToggle && !document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
       navToggle.click();
     }
@@ -41,5 +33,8 @@ function toggle_controls(value) {
     if (extensions) {
       extensions.style.display = "none";
     }
+    if (galleryExtension) {
+      galleryExtension.style.display = "none";
+    }
   }
 }
diff --git a/js/switch_tabs.js b/js/switch_tabs.js
index 36e5736b..a1b44ef3 100644
--- a/js/switch_tabs.js
+++ b/js/switch_tabs.js
@@ -2,17 +2,9 @@ function scrollToTop() {
   window.scrollTo({ top: 0 });
 }
 
-function findButtonsByText(buttonText) {
-  const buttons = document.getElementsByTagName("button");
-  const matchingButtons = [];
-
-  for (let i = 0; i < buttons.length; i++) {
-    if (buttons[i].textContent.trim() === buttonText) {
-      matchingButtons.push(buttons[i]);
-    }
-  }
-
-  return matchingButtons;
+function findButtonsByText(buttonText, container = document) {
+  return Array.from(container.getElementsByTagName("button"))
+    .filter(btn => btn.textContent.trim() === buttonText);
 }
 
 function switch_to_chat() {
@@ -39,13 +31,9 @@ function switch_to_character() {
 
 function switch_to_image_ai_generate() {
   const container = document.querySelector("#image-ai-tab");
-  const buttons = container.getElementsByTagName("button");
-
-  for (let i = 0; i < buttons.length; i++) {
-    if (buttons[i].textContent.trim() === "Generate") {
-      buttons[i].click();
-      break;
-    }
+  const generateBtn = findButtonsByText("Generate", container)[0];
+  if (generateBtn) {
+    generateBtn.click();
   }
 
   scrollToTop();
diff --git a/js/update_big_picture.js b/js/update_big_picture.js
index ec51d63b..8f638c99 100644
--- a/js/update_big_picture.js
+++ b/js/update_big_picture.js
@@ -1,7 +1,6 @@
 function updateBigPicture() {
   var existingElement = document.querySelector(".bigProfilePicture");
   if (existingElement) {
-    var timestamp = new Date().getTime();
-    existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
+    existingElement.src = getProfilePictureUrl();
   }
 }
diff --git a/modules/extensions.py b/modules/extensions.py
index 09db9f40..afe847f0 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -191,21 +191,19 @@ def _apply_custom_generate_reply():
 
 
 def _apply_custom_css():
-    all_css = ''
-    for extension, _ in iterator():
-        if hasattr(extension, 'custom_css'):
-            all_css += getattr(extension, 'custom_css')()
-
-    return all_css
+    return ''.join(
+        getattr(extension, 'custom_css')()
+        for extension, _ in iterator()
+        if hasattr(extension, 'custom_css')
+    )
 
 
 def _apply_custom_js():
-    all_js = ''
-    for extension, _ in iterator():
-        if hasattr(extension, 'custom_js'):
-            all_js += getattr(extension, 'custom_js')()
-
-    return all_js
+    return ''.join(
+        getattr(extension, 'custom_js')()
+        for extension, _ in iterator()
+        if hasattr(extension, 'custom_js')
+    )
 
 
 def create_extensions_block():

From 71c1a52afe54ab599ab5849ae80f1d5a3a72fb5a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 30 Mar 2026 20:49:38 -0700
Subject: [PATCH 12/76] API: Implement echo + logprobs for /v1/completions
 endpoint

---
 modules/api/completions.py  | 299 ++++++++++++++++++++++++++++++------
 modules/exllamav3.py        |  26 +++-
 modules/llama_cpp_server.py |  39 ++++-
 3 files changed, 309 insertions(+), 55 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 8948bb86..587ad6ea 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -39,6 +39,129 @@ def load_chat_template_file(filepath):
     return text
 
 
+def _first_token_display_str(token_id, prompt, tokenizer):
+    """Return the display string for the first prompt token.
+
+    Returns empty string for BOS or tokens that don't appear at the start
+    of the prompt text, so they don't shift text_offset for subsequent tokens.
+    """
+    token_id = int(token_id)
+    bos_id = getattr(tokenizer, 'bos_token_id', None)
+    if bos_id is not None and token_id == bos_id:
+        return ""
+
+    import torch
+    tok = tokenizer.decode(torch.tensor([token_id]))
+    if not prompt.startswith(tok):
+        return ""
+
+    return tok
+
+
+def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
+    """Compute logprob entries for prompt tokens via a forward pass.
+
+    Returns a list of logprob entries in the standard format.
+    The first token gets a null entry (no conditioning context).
+
+    Supported for native ExLlamav3 and HF-compatible loaders (Transformers,
+    ExLlamav3_HF, etc.) via a single forward pass, and for llama.cpp via the
+    server's prompt_logprobs parameter. Returns [] for unsupported loaders.
+    """
+    if input_ids is None:
+        input_ids = encode(prompt)  # (1, seq_len) tensor or array
+
+    token_ids = input_ids[0]
+    n_tokens = len(token_ids)
+
+    if n_tokens == 0:
+        return []
+
+    loader = shared.args.loader
+    model = shared.model
+
+    if loader == 'llama.cpp':
+        return model.get_prompt_logprob_entries(token_ids, max(logprobs_count, 1), prompt=prompt)
+
+    first_token_str = _first_token_display_str(token_ids[0], prompt, shared.tokenizer)
+
+    if n_tokens <= 1:
+        return [{"token": first_token_str, "null_logprob": True}]
+
+    import torch
+
+    if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
+        # Native ExLlamav3: call the underlying Model.forward() directly
+        input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+        with torch.no_grad():
+            logits = model.model.forward(
+                input_ids=input_ids_tensor,
+                params={
+                    "attn_mode": "flash_attn",
+                    "cache": model.cache,
+                    "past_len": 0,
+                    "batch_shape": (1, model.max_tokens),
+                }
+            ).float().cpu()
+
+    elif hasattr(model, 'forward'):
+        # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
+        input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+        if hasattr(model, 'device'):
+            input_ids_tensor = input_ids_tensor.to(model.device)
+        with torch.no_grad():
+            # Pass labels to ensure logits are returned for ALL positions,
+            # not just the last token (some HF wrappers like ExLlamav3_HF
+            # only compute the last-token logits when labels are absent).
+            outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor)
+            logits = outputs.logits.float().cpu()
+
+    else:
+        return []
+
+    entries = [{"token": first_token_str, "null_logprob": True}]
+
+    # Batch logsumexp and topk as single operations across all positions
+    # to avoid per-position kernel launch overhead.
+    prompt_logits = logits[0, :n_tokens - 1]  # positions 0..n-2 predict tokens 1..n-1
+    k = min(logprobs_count, prompt_logits.shape[-1])
+    all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1)
+    all_lse = torch.logsumexp(prompt_logits, dim=-1)
+    all_top_log_probs = all_top_values - all_lse.unsqueeze(-1)
+
+    # Decode each unique token ID only once to avoid O(N*k) repeated decode calls
+    unique_ids = set(int(tid) for tid in token_ids[1:])
+    unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist())
+
+    decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids}
+
+    for i in range(1, n_tokens):
+        token_id = int(token_ids[i])
+        idx = i - 1
+        top_log_probs = all_top_log_probs[idx]
+        top_ids = all_top_indices[idx].tolist()
+        actual_token_str = decoded_strs[token_id]
+
+        # Build the top list with the actual prompt token guaranteed at front
+        if token_id in top_ids:
+            actual_lp = top_log_probs[top_ids.index(token_id)].item()
+            alternatives = [
+                {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+                for j in range(k) if top_ids[j] != token_id
+            ]
+        else:
+            actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
+            alternatives = [
+                {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+                for j in range(k - 1)  # drop lowest to make room
+            ]
+
+        entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives}
+        entries.append(entry)
+
+    return entries
+
+
 def _get_raw_logprob_entries(offset=0):
     """Get raw logprob entries from llama.cpp/ExLlamav3 backend, starting from offset.
 
@@ -65,6 +188,21 @@ def _parse_entry_top(entry):
     return entry.get('top_logprobs', entry.get('top_probs', []))
 
 
+def _extract_sampled_token(entry, top):
+    """Get the actually sampled token and its logprob from a logprob entry.
+
+    Uses the entry-level token/logprob when available (the actually sampled
+    token), falling back to top[0] (highest-probability alternative) which
+    may differ with non-greedy sampling.
+    """
+    if 'token' in entry:
+        return entry['token'], entry.get('logprob', entry.get('prob', 0))
+
+    token_str = top[0].get('token', '')
+    token_logprob = top[0].get('logprob', top[0].get('prob', 0))
+    return token_str, token_logprob
+
+
 def format_chat_logprobs(entries):
     """Format logprob entries into OpenAI chat completions logprobs format.
 
@@ -79,9 +217,7 @@ def format_chat_logprobs(entries):
         if not top:
             continue
 
-        chosen = top[0]
-        token_str = chosen.get('token', '')
-        token_logprob = chosen.get('logprob', chosen.get('prob', 0))
+        token_str, token_logprob = _extract_sampled_token(entry, top)
 
         top_list = []
         for item in top:
@@ -118,13 +254,21 @@ def format_completion_logprobs(entries):
     offset = 0
 
     for entry in entries:
+        # Handle null logprob entries (first prompt token with echo)
+        if entry.get("null_logprob"):
+            token_str = entry.get("token", "")
+            tokens.append(token_str)
+            token_logprobs.append(None)
+            top_logprobs.append(None)
+            text_offset.append(offset)
+            offset += len(token_str)
+            continue
+
         top = _parse_entry_top(entry)
         if not top:
             continue
 
-        chosen = top[0]
-        token_str = chosen.get('token', '')
-        token_logprob = chosen.get('logprob', chosen.get('prob', 0))
+        token_str, token_logprob = _extract_sampled_token(entry, top)
 
         tokens.append(token_str)
         token_logprobs.append(token_logprob)
@@ -407,7 +551,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     })
 
     max_tokens = generate_params['max_new_tokens']
-    if max_tokens in [None, 0]:
+    if max_tokens is not None and max_tokens <= 0:
+        raise InvalidRequestError(message="max_tokens must be greater than 0.", param="max_tokens")
+
+    if max_tokens is None:
         generate_params['max_new_tokens'] = 512
         generate_params['auto_max_new_tokens'] = True
 
@@ -652,6 +799,15 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
     # common params
     generate_params = process_parameters(body, is_legacy=is_legacy)
     max_tokens = generate_params['max_new_tokens']
+    if max_tokens is None:
+        generate_params['max_new_tokens'] = 512
+        generate_params['auto_max_new_tokens'] = True
+        max_tokens = 512
+    elif max_tokens < 0:
+        raise InvalidRequestError(message="max_tokens must be greater than or equal to 0.", param="max_tokens")
+    elif max_tokens == 0 and body.get('logprobs') is None:
+        raise InvalidRequestError(message="max_tokens is 0 but no logprobs parameter was specified.", param="max_tokens")
+
     generate_params['stream'] = stream
     if stop_event is not None:
         generate_params['stop_event'] = stop_event
@@ -700,9 +856,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                         prompt = decode(prompt)[0]
 
             prefix = prompt if echo else ''
-            token_count = len(encode(prompt)[0])
+            prompt_input_ids = encode(prompt)
+            token_count = len(prompt_input_ids[0])
             total_prompt_token_count += token_count
 
+            # Compute prompt logprobs once per prompt (shared across n_completions)
+            logprobs_val = body.get('logprobs', None)
+            if echo and logprobs_val is not None and logprobs_val >= 0:
+                prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
+            else:
+                prompt_entries = None
+
             original_seed = generate_params.get('seed', -1)
             for _n in range(n_completions):
                 # Increment seed for each completion to ensure diversity (matches llama.cpp native behavior)
@@ -713,29 +877,41 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                     logprob_proc.token_alternatives_history.clear()
 
                 # generate reply #######################################
-                debug_msg({'prompt': prompt, 'generate_params': generate_params})
-                generator = generate_reply(prompt, generate_params, is_chat=False)
-                answer = ''
-
-                for a in generator:
-                    answer = a
-
-                completion_token_count = len(encode(answer)[0])
-                total_completion_token_count += completion_token_count
-                stop_reason = "stop"
-                if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
-                    stop_reason = "length"
-
-                if logprob_proc:
-                    all_entries = []
-                    for alt in logprob_proc.token_alternatives_history:
-                        all_entries.extend(_dict_to_logprob_entries(alt))
-                    completion_logprobs = format_completion_logprobs(all_entries)
-                elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
-                    raw = getattr(shared.model, 'last_completion_probabilities', None)
-                    completion_logprobs = format_completion_logprobs(raw)
+                if max_tokens == 0:
+                    answer = ''
+                    completion_token_count = 0
+                    stop_reason = "stop"
                 else:
-                    completion_logprobs = None
+                    debug_msg({'prompt': prompt, 'generate_params': generate_params})
+                    generator = generate_reply(prompt, generate_params, is_chat=False)
+                    answer = ''
+
+                    for a in generator:
+                        answer = a
+
+                    completion_token_count = len(encode(answer)[0])
+                    stop_reason = "stop"
+                    if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
+                        stop_reason = "length"
+
+                total_completion_token_count += completion_token_count
+
+                if max_tokens == 0:
+                    all_entries = []
+                else:
+                    if logprob_proc:
+                        all_entries = []
+                        for alt in logprob_proc.token_alternatives_history:
+                            all_entries.extend(_dict_to_logprob_entries(alt))
+                    elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
+                        all_entries = getattr(shared.model, 'last_completion_probabilities', None) or []
+                    else:
+                        all_entries = []
+
+                if prompt_entries:
+                    all_entries = prompt_entries + all_entries
+
+                completion_logprobs = format_completion_logprobs(all_entries) if all_entries else None
 
                 respi = {
                     "index": choice_index,
@@ -775,7 +951,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
                 raise InvalidRequestError(message="API Batched generation not yet supported.", param=prompt_str)
 
         prefix = prompt if echo else ''
-        token_count = len(encode(prompt)[0])
+        prompt_input_ids = encode(prompt)
+        token_count = len(prompt_input_ids[0])
 
         # Check if usage should be included in streaming chunks per OpenAI spec
         stream_options = body.get('stream_options')
@@ -808,37 +985,57 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
 
             return chunk
 
+        logprobs_val = body.get('logprobs', None)
+        if echo and logprobs_val is not None and logprobs_val >= 0:
+            prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
+            prompt_logprobs_formatted = format_completion_logprobs(prompt_entries) if prompt_entries else None
+        else:
+            prompt_logprobs_formatted = None
+
+        # Clear stale logprobs from any previous request before building the
+        # first chunk, so text_streaming_chunk doesn't pick up old data.
+        if hasattr(shared.model, 'last_completion_probabilities'):
+            shared.model.last_completion_probabilities = []
+        cmpl_logprobs_offset[0] = 0
+
         chunk = text_streaming_chunk(prefix)
+        if prompt_logprobs_formatted is not None:
+            chunk[resp_list][0]["logprobs"] = prompt_logprobs_formatted
         if include_usage:
             chunk['usage'] = None
         yield chunk
 
         # generate reply #######################################
-        debug_msg({'prompt': prompt, 'generate_params': generate_params})
-        generator = generate_reply(prompt, generate_params, is_chat=False)
-        answer = ''
-        seen_content = ''
-        completion_token_count = 0
+        if max_tokens == 0:
+            answer = ''
+            completion_token_count = 0
+            stop_reason = "stop"
+        else:
+            debug_msg({'prompt': prompt, 'generate_params': generate_params})
+            generator = generate_reply(prompt, generate_params, is_chat=False)
+            answer = ''
+            seen_content = ''
+            completion_token_count = 0
 
-        for a in generator:
-            answer = a
+            for a in generator:
+                answer = a
 
-            len_seen = len(seen_content)
-            new_content = answer[len_seen:]
+                len_seen = len(seen_content)
+                new_content = answer[len_seen:]
 
-            if not new_content or chr(0xfffd) in new_content:  # partial unicode character, don't send it yet.
-                continue
+                if not new_content or chr(0xfffd) in new_content:  # partial unicode character, don't send it yet.
+                    continue
 
-            seen_content = answer
-            chunk = text_streaming_chunk(new_content)
-            if include_usage:
-                chunk['usage'] = None
-            yield chunk
+                seen_content = answer
+                chunk = text_streaming_chunk(new_content)
+                if include_usage:
+                    chunk['usage'] = None
+                yield chunk
 
-        completion_token_count = len(encode(answer)[0])
-        stop_reason = "stop"
-        if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
-            stop_reason = "length"
+            completion_token_count = len(encode(answer)[0])
+            stop_reason = "stop"
+            if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
+                stop_reason = "length"
 
         chunk = text_streaming_chunk(suffix)
         chunk[resp_list][0]["finish_reason"] = stop_reason
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index f873503a..3782a693 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -489,15 +489,35 @@ class Exllamav3Model:
             return
 
         id_to_piece = self.tokenizer.get_id_to_piece_list(True)
+        sampled_ids = result.get("token_ids")    # (batch, seq_len) - actually sampled tokens
+        sampled_probs = result.get("token_probs")  # (batch, seq_len) - their probabilities
+
+        def _piece(tid):
+            s = id_to_piece[tid] if tid < len(id_to_piece) else f"<{tid}>"
+            return s.replace('\u2581', ' ')
+
+        def _logprob(prob):
+            return math.log(prob) if prob > 0 else float("-inf")
+
         # top_k_tokens shape: (batch, seq_len, k), top_k_probs same
         for seq_idx in range(top_k_tokens.shape[1]):
             entry = {"top_logprobs": []}
             for k_idx in range(top_k_tokens.shape[2]):
                 token_id = top_k_tokens[0, seq_idx, k_idx].item()
                 prob = top_k_probs[0, seq_idx, k_idx].item()
-                token_str = id_to_piece[token_id] if token_id < len(id_to_piece) else f"<{token_id}>"
-                logprob = math.log(prob) if prob > 0 else float("-inf")
-                entry["top_logprobs"].append({"token": token_str, "logprob": logprob})
+                entry["top_logprobs"].append({"token": _piece(token_id), "logprob": _logprob(prob)})
+
+            # Record the actually sampled token at the entry level so
+            # format_completion_logprobs uses it instead of top_logprobs[0]
+            # (they differ with non-greedy sampling).
+            if sampled_ids is not None:
+                sid = sampled_ids[0, seq_idx].item()
+                entry["token"] = _piece(sid)
+                if sampled_probs is not None:
+                    entry["logprob"] = _logprob(sampled_probs[0, seq_idx].item())
+                else:
+                    entry["logprob"] = None
+
             self.last_completion_probabilities.append(entry)
 
     def generate(self, prompt, state):
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index fa968be1..34080466 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -310,8 +310,45 @@ class LlamaServer:
         else:
             raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
 
+    def get_prompt_logprob_entries(self, token_ids, n_probs=5, prompt=""):
+        """Get logprob entries for prompt tokens via a single n_predict=0 request.
+
+        Requires llama.cpp server with prompt_logprobs support.
+        Returns entries in the standard format for format_completion_logprobs().
+        """
+        token_ids_list = token_ids.tolist() if hasattr(token_ids, 'tolist') else list(token_ids)
+
+        url = f"http://127.0.0.1:{self.port}/completion"
+        payload = {
+            "prompt": token_ids_list,
+            "n_predict": 0,
+            "n_probs": n_probs,
+            "prompt_logprobs": True,
+            "stream": False,
+            "cache_prompt": False,
+        }
+
+        response = self.session.post(url, json=payload)
+        result = response.json()
+
+        prompt_probs = result.get("prompt_probabilities", [])
+        if not prompt_probs:
+            return []
+
+        # Null first token (no conditioning context); use empty string for BOS
+        # or tokens that don't appear at the start of the prompt text.
+        first_token_str = self.decode([token_ids_list[0]])
+        if self.bos_token and first_token_str == self.bos_token:
+            first_token_str = ""
+        elif not prompt.startswith(first_token_str):
+            first_token_str = ""
+
+        entries = [{"token": first_token_str, "null_logprob": True}]
+        entries.extend(prompt_probs)
+        return entries
+
     def _get_vocabulary_size(self):
-        """Get and store the model's maximum context length."""
+        """Get and store the model's vocabulary size."""
         url = f"http://127.0.0.1:{self.port}/v1/models"
         response = self.session.get(url).json()
 

From 328534b762f22c82b09babf6b04e289eab4a7fde Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 12:51:07 -0700
Subject: [PATCH 13/76] Update llama.cpp

---
 requirements/full/requirements.txt                   | 8 ++++----
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 8 ++++----
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 12 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 6e11dd2f..57991c9a 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index c964eff6..bb47ea4b 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b1dd6a4f..5750b109 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 4d03d280..d8302d3d 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 9d41d069..d3a5c008 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,7 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index ff80b6c8..1180b42d 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 318044da..57aa6262 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 1676bffb..894c9199 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 27fc2da8..32b9727f 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 0bbdd30a..73b72832 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index c3ae3c57..ad96bbe2 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index e646c04c..a5df3ad4 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 4073164be0b305d8ac4a01d4259448370d009a99 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 19:08:37 -0700
Subject: [PATCH 14/76] Fix ExLlamav3 OOM on prompt logprobs and qwen3_5_moe HF
 compat

---
 modules/api/completions.py | 13 +++++--------
 modules/exllamav3.py       | 33 ++++-----------------------------
 modules/exllamav3_hf.py    | 32 ++++++++------------------------
 3 files changed, 17 insertions(+), 61 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 587ad6ea..a15e1f86 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -91,17 +91,14 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
     import torch
 
     if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
-        # Native ExLlamav3: call the underlying Model.forward() directly
+        # Native ExLlamav3: call the underlying Model.forward() in chunks
+        # to avoid OOM from giant logits tensors (seq_len * vocab_size * 4 bytes)
         input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+        input_ids_tensor = input_ids_tensor.view(-1).cpu()
         with torch.no_grad():
             logits = model.model.forward(
-                input_ids=input_ids_tensor,
-                params={
-                    "attn_mode": "flash_attn",
-                    "cache": model.cache,
-                    "past_len": 0,
-                    "batch_shape": (1, model.max_tokens),
-                }
+                input_ids=input_ids_tensor.view(1, -1),
+                params={"attn_mode": "flash_attn_nc"}
             ).float().cpu()
 
     elif hasattr(model, 'forward'):
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 3782a693..7556a908 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -530,39 +530,14 @@ class Exllamav3Model:
     def get_logits(self, token_ids, **kwargs):
         """
         Process a batch of token_ids and return the logits for the last token.
-        This will reset and overwrite the model's cache.
+        Uses flash_attn_nc (no cache) for correct results with recurrent models.
         """
-        # Initialize a single params dictionary that will be updated in-place
-        params = {
-            "cache": self.cache,
-            "reconstruct": False,
-            "attn_mode": "flash_attn",
-            "batch_shape": (1, self.max_tokens),
-            "past_len": 0
-        }
-        params.update(kwargs)
-
-        # Process prefix tokens to fill the cache and generate recurrent state
-        if token_ids.shape[-1] > 1:
-            prefix_ids = token_ids[:, :-1]
-
-            # This forward call updates the 'params' dict with the recurrent state
-            self.model.forward(
-                input_ids=prefix_ids,
-                params=params
-            )
-
-            # Update past_len for the next call
-            params["past_len"] = prefix_ids.shape[-1]
-
-        # Process the last token, now using the state-filled 'params' dict
-        last_token_ids = token_ids[:, -1:]
         logits = self.model.forward(
-            input_ids=last_token_ids,
-            params=params
+            input_ids=token_ids,
+            params={"attn_mode": "flash_attn_nc"}
         )
 
-        return logits.float().cpu()
+        return logits[:, -1:, :].float().cpu()
 
     def encode(self, string, **kwargs):
         add_bos = kwargs.pop('add_bos', True)
diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index e0ad5002..5e634e22 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -26,6 +26,9 @@ except Exception:
 class Exllamav3HF(PreTrainedModel, GenerationMixin):
     def __init__(self, model_dir):
         hf_config = PretrainedConfig.from_pretrained(model_dir)
+        # Ensure text_config is a proper object, not a dict (fixes qwen3_5_moe + transformers compat)
+        if isinstance(getattr(hf_config, 'text_config', None), dict):
+            hf_config.text_config = PretrainedConfig(**hf_config.text_config)
         super().__init__(hf_config)
 
         exl3_config = Config.from_directory(model_dir)
@@ -199,30 +202,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
                 }
             ).to(input_ids.device).float()
         else:
-            # Labels path: use cache for cross-chunk attention.
-            tokens_to_process = seq_tensor
-            all_logits = None
-            current_len = 0
-
-            for i in range(0, tokens_to_process.shape[0], max_chunk_size):
-                chunk = tokens_to_process[i:i + max_chunk_size]
-                chunk_logits = self.ex_model.forward(
-                    input_ids=chunk.view(1, -1),
-                    params={
-                        "attn_mode": "flash_attn",
-                        "cache": ex_cache,
-                        "past_len": current_len,
-                        "batch_shape": (1, self.max_tokens),
-                    }
-                ).float()
-                current_len += chunk.shape[0]
-
-                if all_logits is None:
-                    all_logits = chunk_logits
-                else:
-                    all_logits = torch.cat([all_logits, chunk_logits], dim=1)
-
-            logits = all_logits
+            # Labels path: single pass without cache for correct logits
+            logits = self.ex_model.forward(
+                input_ids=seq_tensor.view(1, -1),
+                params={"attn_mode": "flash_attn_nc"}
+            ).float().cpu()
 
         if is_negative:
             self.past_seq_negative = seq_tensor

From a32ce254f275efe473d6624995957b3b6bd51aa1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 20:28:44 -0700
Subject: [PATCH 15/76] Don't pass torch_dtype to transformers, autodetect from
 model config

---
 modules/transformers_loader.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index 7f521b8c..5964f012 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -109,7 +109,6 @@ def load_model_HF(model_name):
     params = {
         'low_cpu_mem_usage': True,
         'attn_implementation': shared.args.attn_implementation,
-        'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
     }
 
     if shared.original_args.trust_remote_code:
@@ -120,6 +119,17 @@ def load_model_HF(model_name):
 
     config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code)
 
+    # Determine torch_dtype: respect --bf16 flag, otherwise autodetect
+    # from model config, but never allow float32.
+    if shared.args.bf16:
+        params['torch_dtype'] = torch.bfloat16
+    else:
+        dtype = getattr(config, 'torch_dtype', None) or getattr(getattr(config, 'text_config', None), 'torch_dtype', None)
+        if dtype in (torch.float16, torch.bfloat16):
+            params['torch_dtype'] = dtype
+        else:
+            params['torch_dtype'] = torch.float16
+
     if 'chatglm' in model_name.lower():
         LoaderClass = AutoModel
     else:

From c10c6e87ae0b0085b36e7e13269461744ce04ff6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 07:17:27 -0700
Subject: [PATCH 16/76] API: Add token ids to logprobs output

---
 modules/api/completions.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index a15e1f86..453fa07b 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -143,17 +143,17 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
         if token_id in top_ids:
             actual_lp = top_log_probs[top_ids.index(token_id)].item()
             alternatives = [
-                {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+                {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
                 for j in range(k) if top_ids[j] != token_id
             ]
         else:
             actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
             alternatives = [
-                {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+                {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
                 for j in range(k - 1)  # drop lowest to make room
             ]
 
-        entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives}
+        entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives}
         entries.append(entry)
 
     return entries
@@ -239,7 +239,7 @@ def format_chat_logprobs(entries):
 def format_completion_logprobs(entries):
     """Format logprob entries into OpenAI completions logprobs format.
 
-    Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "text_offset"}
+    Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "top_logprobs_ids": [{token_id: prob}], "text_offset"}
     """
     if not entries:
         return None
@@ -247,6 +247,7 @@ def format_completion_logprobs(entries):
     tokens = []
     token_logprobs = []
     top_logprobs = []
+    top_logprobs_ids = []
     text_offset = []
     offset = 0
 
@@ -257,6 +258,7 @@ def format_completion_logprobs(entries):
             tokens.append(token_str)
             token_logprobs.append(None)
             top_logprobs.append(None)
+            top_logprobs_ids.append(None)
             text_offset.append(offset)
             offset += len(token_str)
             continue
@@ -273,21 +275,28 @@ def format_completion_logprobs(entries):
         offset += len(token_str)
 
         top_dict = {}
+        top_dict_ids = {}
         for item in top:
             t = item.get('token', '')
             lp = item.get('logprob', item.get('prob', 0))
             top_dict[t] = lp
+            if 'token_id' in item:
+                top_dict_ids[item['token_id']] = lp
         top_logprobs.append(top_dict)
+        top_logprobs_ids.append(top_dict_ids if top_dict_ids else None)
 
     if not tokens:
         return None
 
-    return {
+    result = {
         "tokens": tokens,
         "token_logprobs": token_logprobs,
         "top_logprobs": top_logprobs,
         "text_offset": text_offset
     }
+    if any(x is not None for x in top_logprobs_ids):
+        result["top_logprobs_ids"] = top_logprobs_ids
+    return result
 
 
 def process_parameters(body, is_legacy=False):

From ea1f8c71f2e92dc9ae230b943c605e43ff5c633c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:30:59 -0300
Subject: [PATCH 17/76] API: Optimize prompt logprobs and refactor ExLlamav3
 forward pass

---
 modules/api/completions.py | 69 ++++++++++++++++++++++++--------------
 modules/exllamav3.py       | 14 ++++++++
 2 files changed, 58 insertions(+), 25 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 453fa07b..4eb8fdad 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -90,16 +90,8 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
 
     import torch
 
-    if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
-        # Native ExLlamav3: call the underlying Model.forward() in chunks
-        # to avoid OOM from giant logits tensors (seq_len * vocab_size * 4 bytes)
-        input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
-        input_ids_tensor = input_ids_tensor.view(-1).cpu()
-        with torch.no_grad():
-            logits = model.model.forward(
-                input_ids=input_ids_tensor.view(1, -1),
-                params={"attn_mode": "flash_attn_nc"}
-            ).float().cpu()
+    if hasattr(model, 'get_prompt_logits'):
+        logits = model.get_prompt_logits(input_ids)
 
     elif hasattr(model, 'forward'):
         # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
@@ -111,26 +103,54 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
             # not just the last token (some HF wrappers like ExLlamav3_HF
             # only compute the last-token logits when labels are absent).
             outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor)
-            logits = outputs.logits.float().cpu()
+            logits = outputs.logits  # keep on GPU, (1, seq_len, vocab) in model dtype
+            del outputs
 
     else:
         return []
 
     entries = [{"token": first_token_str, "null_logprob": True}]
 
-    # Batch logsumexp and topk as single operations across all positions
-    # to avoid per-position kernel launch overhead.
-    prompt_logits = logits[0, :n_tokens - 1]  # positions 0..n-2 predict tokens 1..n-1
-    k = min(logprobs_count, prompt_logits.shape[-1])
-    all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1)
-    all_lse = torch.logsumexp(prompt_logits, dim=-1)
-    all_top_log_probs = all_top_values - all_lse.unsqueeze(-1)
-
-    # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls
+    logprobs_count = max(logprobs_count, 1)
+    k = min(logprobs_count, logits.shape[-1])
+    chunk_size = 2048
     unique_ids = set(int(tid) for tid in token_ids[1:])
-    unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist())
 
-    decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids}
+    # Process logits in chunks on GPU, only move top-K results to CPU
+    all_top_log_probs_list = []
+    all_top_indices_list = []
+    all_actual_lps = []
+
+    for start in range(0, n_tokens - 1, chunk_size):
+        end = min(start + chunk_size, n_tokens - 1)
+        chunk_logits = logits[0, start:end].float()  # (chunk, vocab) on GPU
+        chunk_lse = torch.logsumexp(chunk_logits, dim=-1)
+        chunk_top_values, chunk_top_indices = torch.topk(chunk_logits, k=k, dim=-1)
+        chunk_top_log_probs = chunk_top_values - chunk_lse.unsqueeze(-1)
+
+        # Compute logprob for actual next tokens in this chunk
+        chunk_top_sets = [set(chunk_top_indices[j].tolist()) for j in range(end - start)]
+        for j in range(end - start):
+            actual_tid = int(token_ids[start + j + 1])
+            if actual_tid not in chunk_top_sets[j]:
+                all_actual_lps.append((chunk_logits[j, actual_tid] - chunk_lse[j]).item())
+            else:
+                all_actual_lps.append(None)  # will use top_log_probs
+
+        all_top_log_probs_list.append(chunk_top_log_probs.cpu())
+        all_top_indices_list.append(chunk_top_indices.cpu())
+        unique_ids.update(int(tid) for tid in chunk_top_indices.flatten().tolist())
+        del chunk_logits, chunk_lse, chunk_top_values
+
+    del logits
+    torch.cuda.empty_cache()
+
+    all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0)
+    all_top_indices = torch.cat(all_top_indices_list, dim=0)
+
+    unique_ids_list = sorted(unique_ids)
+    decoded_list = shared.tokenizer.batch_decode([[tid] for tid in unique_ids_list]) if hasattr(shared.tokenizer, 'batch_decode') else [shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids_list]
+    decoded_strs = dict(zip(unique_ids_list, decoded_list))
 
     for i in range(1, n_tokens):
         token_id = int(token_ids[i])
@@ -139,7 +159,6 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
         top_ids = all_top_indices[idx].tolist()
         actual_token_str = decoded_strs[token_id]
 
-        # Build the top list with the actual prompt token guaranteed at front
         if token_id in top_ids:
             actual_lp = top_log_probs[top_ids.index(token_id)].item()
             alternatives = [
@@ -147,10 +166,10 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
                 for j in range(k) if top_ids[j] != token_id
             ]
         else:
-            actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
+            actual_lp = all_actual_lps[idx]
             alternatives = [
                 {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
-                for j in range(k - 1)  # drop lowest to make room
+                for j in range(k - 1)
             ]
 
         entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives}
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 7556a908..e1efbfeb 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -527,6 +527,20 @@ class Exllamav3Model:
 
         return output
 
+    def get_prompt_logits(self, input_ids):
+        """Return logits for all positions via a single no-cache forward pass.
+
+        Used by prompt logprobs computation. Returns (1, seq_len, vocab) on CPU in float32.
+        """
+        import torch
+        input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+        input_ids_tensor = input_ids_tensor.view(1, -1).cpu()
+        with torch.no_grad():
+            return self.model.forward(
+                input_ids=input_ids_tensor,
+                params={"attn_mode": "flash_attn_nc"}
+            ).cpu().float()
+
     def get_logits(self, token_ids, **kwargs):
         """
         Process a batch of token_ids and return the logits for the last token.

From c50e17bdbe1da850189188afaf0682a952efa0d1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:49:31 -0300
Subject: [PATCH 18/76] Add dedicated ik portable requirements files and remove
 macOS ik builds

---
 .github/workflows/build-everything-tgw.yml    |  7 ---
 .../build-portable-release-ik-cuda.yml        |  9 ++--
 .../workflows/build-portable-release-ik.yml   | 44 +++----------------
 requirements/portable/requirements_ik.txt     | 27 ++++++++++++
 .../portable/requirements_ik_cpu_only.txt     | 27 ++++++++++++
 .../portable/requirements_ik_cuda131.txt      | 27 ++++++++++++
 6 files changed, 91 insertions(+), 50 deletions(-)
 create mode 100644 requirements/portable/requirements_ik.txt
 create mode 100644 requirements/portable/requirements_ik_cpu_only.txt
 create mode 100644 requirements/portable/requirements_ik_cuda131.txt

diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 4de591f4..40d9db5d 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -96,10 +96,3 @@ jobs:
     with:
       version: ${{ inputs.version }}
       config: 'os:ubuntu-22.04'
-
-  build_release_ik_macos:
-    name: ik macOS
-    uses: ./.github/workflows/build-portable-release-ik.yml
-    with:
-      version: ${{ inputs.version }}
-      config: 'os:macos-14'
diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml
index 40b4b92f..331a7653 100644
--- a/.github/workflows/build-portable-release-ik-cuda.yml
+++ b/.github/workflows/build-portable-release-ik-cuda.yml
@@ -138,14 +138,13 @@ jobs:
             # 3. Prepare requirements file based on CUDA version
             cd "text-generation-webui-${VERSION_CLEAN}"
             if [[ "$CUDA_VERSION" == "13.1" ]]; then
-                REQ_FILE="requirements/portable/requirements_cuda131.txt"
+                REQ_FILE="requirements/portable/requirements_ik_cuda131.txt"
             else
-                REQ_FILE="requirements/portable/requirements.txt"
+                REQ_FILE="requirements/portable/requirements_ik.txt"
             fi
 
-            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
-            sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
-            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true
+            # 4. Inject --ik into start scripts
+            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
 
             # 5. Install packages
             echo "Installing Python packages from $REQ_FILE..."
diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml
index afb2e763..bf54eb0e 100644
--- a/.github/workflows/build-portable-release-ik.yml
+++ b/.github/workflows/build-portable-release-ik.yml
@@ -1,4 +1,4 @@
-name: Build ik CPU and macOS
+name: Build ik CPU
 
 on:
   workflow_dispatch:
@@ -57,7 +57,7 @@ jobs:
         id: set-matrix
         run: |
           $matrix = @{
-              'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
+              'os' = @('ubuntu-22.04', 'windows-2022')
               'pyver' = @("3.13")
           }
 
@@ -110,7 +110,6 @@ jobs:
 
             # Define common variables
             VERSION="${{ inputs.version }}"
-            OS_TYPE="${{ matrix.os }}"
 
             # 1. Set platform-specific variables
             if [[ "$RUNNER_OS" == "Windows" ]]; then
@@ -119,21 +118,7 @@ jobs:
                 PIP_PATH="portable_env/python.exe -m pip"
                 PACKAGES_PATH="portable_env/Lib/site-packages"
                 rm start_linux.sh start_macos.sh
-            elif [[ "$RUNNER_OS" == "macOS" ]]; then
-                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
-                    PLATFORM="macos-x86_64"
-                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz"
-                    REQ_TYPE="apple_intel"
-                else
-                    PLATFORM="macos-arm64"
-                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz"
-                    REQ_TYPE="apple_silicon"
-                fi
-                PIP_PATH="portable_env/bin/python -m pip"
-                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
-                rm start_linux.sh start_windows.bat
             else
-                # Linux case
                 PLATFORM="linux-cpu"
                 PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
                 PIP_PATH="portable_env/bin/python -m pip"
@@ -148,30 +133,13 @@ jobs:
             tar -xzf python-build.tar.gz
             mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
 
-            # 3. Prepare requirements file based on platform
+            # 3. Prepare requirements file
             cd "text-generation-webui-${VERSION_CLEAN}"
-
-            # Select requirements file based on platform
-            if [[ "$RUNNER_OS" == "macOS" ]]; then
-                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
-                    REQ_FILE="requirements/portable/requirements_apple_intel.txt"
-                else
-                    REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
-                fi
-            else
-                REQ_FILE="requirements/portable/requirements_cpu_only.txt"
-            fi
-
+            REQ_FILE="requirements/portable/requirements_ik_cpu_only.txt"
             echo "Using requirements file: $REQ_FILE"
 
-            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
-            if [[ "$RUNNER_OS" == "macOS" ]]; then
-                sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
-                sed -i '' 's/--portable/--portable --ik/g' start_macos.sh
-            else
-                sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
-                sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
-            fi
+            # 4. Inject --ik into start scripts
+            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
 
             # 5. Install packages
             echo "Installing Python packages from $REQ_FILE..."
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
new file mode 100644
index 00000000..2fa037f7
--- /dev/null
+++ b/requirements/portable/requirements_ik.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# CUDA wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
new file mode 100644
index 00000000..b43b51c4
--- /dev/null
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# ik_llama.cpp (CPU only)
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
new file mode 100644
index 00000000..12767285
--- /dev/null
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# CUDA wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 8f8b57a029715d07ab164aa22a779ea7ea4619f1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:54:20 -0700
Subject: [PATCH 19/76] Update exllamav3

---
 requirements/full/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 57991c9a..5591c9ca 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -44,7 +44,7 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"

From 6a1f720c7bb9aef73c1c7c4e311460174c5255ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:58:20 -0700
Subject: [PATCH 20/76] Update transformers

---
 requirements/full/requirements.txt               | 2 +-
 requirements/full/requirements_amd.txt           | 2 +-
 requirements/full/requirements_apple_intel.txt   | 2 +-
 requirements/full/requirements_apple_silicon.txt | 2 +-
 requirements/full/requirements_cpu_only.txt      | 2 +-
 requirements/full/requirements_nowheels.txt      | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 5591c9ca..30ee0316 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -25,7 +25,7 @@ sentencepiece
 tensorboard
 torchao==0.15.*
 trafilatura==2.0.0
-transformers==5.3.*
+transformers==5.5.*
 triton-windows==3.5.1.post24; platform_system == "Windows"
 tqdm
 wandb
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index bb47ea4b..9edc1d95 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 5750b109..ff8687c1 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d8302d3d..208632e8 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index d3a5c008..4a7e5aaa 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 052085cc..6200589e 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -22,7 +22,7 @@ scipy
 sentencepiece
 tensorboard
 torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
 tqdm
 trafilatura==2.0.0
 wandb

From 468cb5cb87bf02f96efcd5acb1d1ac4b08c68273 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:59:28 -0700
Subject: [PATCH 21/76] Update accelerate

---
 requirements/full/requirements.txt               | 2 +-
 requirements/full/requirements_amd.txt           | 2 +-
 requirements/full/requirements_apple_intel.txt   | 2 +-
 requirements/full/requirements_apple_silicon.txt | 2 +-
 requirements/full/requirements_cpu_only.txt      | 2 +-
 requirements/full/requirements_nowheels.txt      | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 30ee0316..e5bec6ec 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 bitsandbytes==0.49.*
 datasets
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 9edc1d95..c6b5b2d0 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ff8687c1..ce671f0a 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 208632e8..d12d9f80 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 4a7e5aaa..4066b1af 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 6200589e..7173345a 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
 audioop-lts<1.0; python_version >= "3.13"
 datasets
 diffusers==0.37.*

From 80e81a54cacacbd8aa16ccf312ae0e574e4b416c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 11:11:44 -0700
Subject: [PATCH 22/76] Remove ik macOS wheels from full requirements

---
 requirements/full/requirements_apple_intel.txt   | 1 -
 requirements/full/requirements_apple_silicon.txt | 1 -
 2 files changed, 2 deletions(-)

diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ce671f0a..55a313e9 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -38,4 +38,3 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d12d9f80..a6d34cbb 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -38,4 +38,3 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"

From f6f8f14c8d0993327a2c86dfa3c976a7c1c569fc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 16:13:39 -0300
Subject: [PATCH 23/76] Security: Fix SSRF in superbooga extensions

---
 extensions/superbooga/download_urls.py   | 3 +++
 extensions/superboogav2/download_urls.py | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/extensions/superbooga/download_urls.py b/extensions/superbooga/download_urls.py
index 424a9885..b28fea42 100644
--- a/extensions/superbooga/download_urls.py
+++ b/extensions/superbooga/download_urls.py
@@ -2,8 +2,11 @@ import concurrent.futures
 
 import requests
 
+from modules.web_search import _validate_url
+
 
 def download_single(url):
+    _validate_url(url)
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
diff --git a/extensions/superboogav2/download_urls.py b/extensions/superboogav2/download_urls.py
index 5b5a2e17..4d8b98b1 100644
--- a/extensions/superboogav2/download_urls.py
+++ b/extensions/superboogav2/download_urls.py
@@ -5,12 +5,14 @@ import requests
 from bs4 import BeautifulSoup
 
 import extensions.superboogav2.parameters as parameters
+from modules.web_search import _validate_url
 
 from .data_processor import process_and_add_to_collector
 from .utils import create_metadata_source
 
 
 def _download_single(url):
+    _validate_url(url)
     response = requests.get(url, timeout=5)
     if response.status_code == 200:
         return response.content

From 091037ec20743ac6c7bccb75b59743045a692c4a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 16:13:45 -0300
Subject: [PATCH 24/76] Fix top_logprobs_ids missing for llama.cpp loader

---
 modules/api/completions.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 4eb8fdad..98bcff47 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -299,8 +299,9 @@ def format_completion_logprobs(entries):
             t = item.get('token', '')
             lp = item.get('logprob', item.get('prob', 0))
             top_dict[t] = lp
-            if 'token_id' in item:
-                top_dict_ids[item['token_id']] = lp
+            tid = item.get('token_id', item.get('id'))
+            if tid is not None:
+                top_dict_ids[tid] = lp
         top_logprobs.append(top_dict)
         top_logprobs_ids.append(top_dict_ids if top_dict_ids else None)
 

From a61bde509ff44a0f7662067bc94efd7f103f3162 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:30:02 -0700
Subject: [PATCH 25/76] Update llama.cpp

---
 requirements/full/requirements.txt                   | 8 ++++----
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 8 ++++----
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 15 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index e5bec6ec..f1a953a5 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index c6b5b2d0..211600e2 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 55a313e9..54d904dd 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index a6d34cbb..8829eb44 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 4066b1af..0a8cfac6 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,7 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 1180b42d..607c642f 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 57aa6262..f0af64c8 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 894c9199..c5f351c5 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 32b9727f..5287aa25 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 73b72832..038318ab 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index ad96bbe2..d87c741e 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 2fa037f7..3e2471ae 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index b43b51c4..8272b9b6 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # ik_llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 12767285..98ef23d7 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index a5df3ad4..157ad313 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From d84157403a1c8b65f8597302463e46c28a6659d1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:31:44 -0700
Subject: [PATCH 26/76] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index f1a953a5..b38ae848 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 211600e2..7fb3a7d9 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 54d904dd..4a0f764c 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 8829eb44..942d5d71 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 0a8cfac6..6b61dca7 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 7173345a..a4d6cc97 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 607c642f..5aff54b2 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index f0af64c8..0771f53e 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index c5f351c5..427d59b2 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 5287aa25..c47a6ca1 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 038318ab..e491e357 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index d87c741e..5870983a 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 3e2471ae..d11d337d 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index 8272b9b6..c2b69e1c 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 98ef23d7..7f280930 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index e38140ce..322056be 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 157ad313..dfd52be5 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 7aab2fdf9aefb0f14fbf58e132a2a9a5850f8319 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:50:42 -0700
Subject: [PATCH 27/76] API: Improve cache clearing in logprobs

---
 modules/api/completions.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/api/completions.py b/modules/api/completions.py
index 98bcff47..f2282731 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -89,6 +89,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
         return [{"token": first_token_str, "null_logprob": True}]
 
     import torch
+    from modules.torch_utils import clear_torch_cache
 
     if hasattr(model, 'get_prompt_logits'):
         logits = model.get_prompt_logits(input_ids)
@@ -143,7 +144,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
         del chunk_logits, chunk_lse, chunk_top_values
 
     del logits
-    torch.cuda.empty_cache()
+    clear_torch_cache()
 
     all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0)
     all_top_indices = torch.cat(all_top_indices_list, dim=0)

From b108c55353e11343a9e8f8566d92e52e868dfa69 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 19:10:41 -0700
Subject: [PATCH 28/76] Fix portable builds not starting due to missing ik
 element

---
 modules/loaders.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index cb1f3d3b..31b1b51a 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -291,16 +291,21 @@ def blacklist_samplers(loader, dynamic_temperature):
 
 @functools.cache
 def get_all_params():
+    from modules import shared
     all_params = set()
     for k in loaders_and_params:
         for el in loaders_and_params[k]:
             all_params.add(el)
 
+    if shared.args.portable:
+        all_params.discard('ik')
+
     return sorted(all_params)
 
 
+@functools.cache
 def list_model_elements():
-    return [
+    elements = [
         'filter_by_loader',
         'loader',
         'cpu_memory',
@@ -346,9 +351,14 @@ def list_model_elements():
         'spec_ngram_size_m',
         'spec_ngram_min_hits',
         'mmproj',
-        'ik',
     ]
 
+    from modules import shared
+    if not shared.args.portable:
+        elements.append('ik')
+
+    return elements
+
 
 def make_loader_params_visible(loader):
     import gradio as gr

From 6e2b70bde60c089f97b0abe97bb1b594cce75357 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 20:26:09 -0700
Subject: [PATCH 29/76] Add Gemma 4 tool-calling support

---
 modules/chat.py         | 57 +++++++++++++++++++++++++++++
 modules/reasoning.py    |  1 +
 modules/tool_parsing.py | 79 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+)

diff --git a/modules/chat.py b/modules/chat.py
index edda11b0..818309e6 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -210,6 +210,57 @@ def _expand_tool_sequence(tool_seq):
     return messages
 
 
+def _convert_to_tool_responses(messages):
+    """Convert role:'tool' messages to tool_responses format.
+
+    Templates like Gemma 4 expect tool results as a ``tool_responses``
+    attribute on a message rather than separate ``role: 'tool'`` messages.
+    This function groups consecutive tool messages and rewrites them.
+    """
+    result = []
+    tc_id_to_name = {}
+
+    i = 0
+    while i < len(messages):
+        msg = messages[i]
+
+        if msg.get('tool_calls'):
+            for tc in msg['tool_calls']:
+                tc_id = tc.get('id', '')
+                func_name = tc.get('function', {}).get('name', 'unknown')
+                if tc_id:
+                    tc_id_to_name[tc_id] = func_name
+
+        if msg.get('role') == 'tool':
+            tool_responses = []
+            while i < len(messages) and messages[i].get('role') == 'tool':
+                tool_msg = messages[i]
+                tc_id = tool_msg.get('tool_call_id', '')
+                func_name = tc_id_to_name.get(tc_id, 'unknown')
+
+                content = tool_msg.get('content', '')
+                try:
+                    response = json.loads(content)
+                except (json.JSONDecodeError, ValueError, TypeError):
+                    response = content
+
+                tool_responses.append({
+                    'name': func_name,
+                    'response': response,
+                })
+                i += 1
+
+            result.append({
+                'role': 'tool',
+                'tool_responses': tool_responses,
+            })
+        else:
+            result.append(msg)
+            i += 1
+
+    return result
+
+
 def _format_attachments(attachments, include_text=True):
     """Build image ref and text attachment strings from a list of attachments."""
     attachments_text = ""
@@ -267,6 +318,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
         tools=state['tools'] if 'tools' in state else None,
     )
 
+    active_template_str = state['instruction_template_str'] if state['mode'] == 'instruct' else chat_template_str
+    uses_tool_responses = 'tool_responses' in active_template_str
+
     messages = []
 
     if state['mode'] == 'instruct':
@@ -503,6 +557,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
         return prompt
 
+    if uses_tool_responses:
+        messages = _convert_to_tool_responses(messages)
+
     prompt = make_prompt(messages)
 
     # Handle truncation
diff --git a/modules/reasoning.py b/modules/reasoning.py
index aa1939b8..4a7cfa79 100644
--- a/modules/reasoning.py
+++ b/modules/reasoning.py
@@ -7,6 +7,7 @@ THINKING_FORMATS = [
     ('<|channel|>analysis<|message|>', '<|end|>', '<|channel|>final<|message|>'),
     ('<|channel|>commentary<|message|>', '<|end|>', '<|channel|>final<|message|>'),
     ('<seed:think>', '</seed:think>', None),
+    ('<|channel>thought', '<channel|>', None),  # Gemma 4
     ('<|think|>', '<|end|>', '<|content|>'),  # Solar Open
     # ('Thinking Process:', '</think>', None),  # Qwen3.5 verbose thinking outside tags -- removed: too prone to false positives in streaming
     (None, '</think>', None),  # End-only variant (e.g., Qwen3-next)
diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index ec49f77f..45da25c9 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -27,6 +27,7 @@ TOOL_CALL_OPENING_MARKERS = [
     '[TOOL_CALLS]',
     'to=functions.',
     '<|channel|>commentary',
+    '<|tool_call>call:',
 ]
 
 
@@ -400,6 +401,78 @@ def _parse_glm_tool_calls(answer: str, tool_names: list[str]):
     return matches, start_pos
 
 
+def _extract_gemma4_balanced(text, start):
+    """Extract balanced braces from Gemma 4 format, using <|"|> as string delimiters."""
+    if start >= len(text) or text[start] != '{':
+        return None
+    depth = 0
+    in_string = False
+    quote_token = '<|"|>'
+    quote_len = len(quote_token)
+    i = start
+    while i < len(text):
+        if text[i:i + quote_len] == quote_token:
+            in_string = not in_string
+            i += quote_len
+            continue
+        if in_string:
+            i += 1
+            continue
+        c = text[i]
+        if c == '{':
+            depth += 1
+        elif c == '}':
+            depth -= 1
+            if depth == 0:
+                return text[start:i + 1]
+        i += 1
+    return None
+
+
+def _parse_gemma4_tool_calls(answer: str, tool_names: list[str]):
+    """Parse Gemma 4-style tool calls.
+
+    Format:
+        <|tool_call>call:func_name{key:<|"|>value<|"|>,...}<tool_call|>
+
+    Values use <|"|> tokens instead of standard JSON quotes, and keys are
+    bare identifiers.
+    """
+    matches = []
+    start_pos = None
+
+    for m in re.finditer(r'<\|tool_call>call:([^\s{]+)\s*', answer):
+        func_name = m.group(1).strip()
+        if func_name not in tool_names:
+            continue
+
+        brace_start = m.end()
+        if brace_start >= len(answer) or answer[brace_start] != '{':
+            continue
+
+        content = _extract_gemma4_balanced(answer, brace_start)
+        if content is None:
+            continue
+
+        # Convert to JSON: split on <|"|> tokens so that key quoting
+        # only applies outside string values (even-indexed parts),
+        # then rejoin with real quotes.
+        parts = content.split('<|"|>')
+        for idx in range(0, len(parts), 2):
+            parts[idx] = re.sub(r'(^|[{,\[])\s*(\w+)\s*:', r'\1"\2":', parts[idx])
+        json_str = '"'.join(parts)
+
+        try:
+            arguments = json.loads(json_str)
+            if start_pos is None:
+                start_pos = m.start()
+            matches.append(_make_tool_call(func_name, arguments))
+        except (json.JSONDecodeError, ValueError):
+            pass
+
+    return matches, start_pos
+
+
 def _parse_pythonic_tool_calls(answer: str, tool_names: list[str]):
     """Parse pythonic-style tool calls used by Llama 4 and similar models.
 
@@ -472,6 +545,11 @@ TOOL_CALL_FORMATS = [
         'parser': _parse_channel_tool_calls,
         'markers': ['to=functions.', '<|channel|>commentary'],
     },
+    {
+        'template_hints': ['<|tool_call>call:'],
+        'parser': _parse_gemma4_tool_calls,
+        'markers': ['<|tool_call>call:'],
+    },
     {
         'template_hints': ['minimax:tool_call'],
         'parser': _parse_minimax_tool_calls,
@@ -504,6 +582,7 @@ ALL_PARSERS = [
     _parse_deep_seek_tool_calls,
     _parse_kimi_tool_calls,
     _parse_channel_tool_calls,
+    _parse_gemma4_tool_calls,
     _parse_minimax_tool_calls,
     _parse_glm_tool_calls,
     _parse_xml_param_tool_calls,

From 42dfcdfc5b50333c40a6adda0f4c8672508212cb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 20:46:27 -0700
Subject: [PATCH 30/76] API: Add warning about vanilla llama-server not
 supporting prompt logprobs + instructions

---
 modules/llama_cpp_server.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 34080466..2d873f00 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -333,6 +333,12 @@ class LlamaServer:
 
         prompt_probs = result.get("prompt_probabilities", [])
         if not prompt_probs:
+            logger.warning(
+                "The llama.cpp server did not return prompt probabilities. "
+                "This feature requires a custom build with prompt_logprobs support. "
+                "See: https://github.com/oobabooga/llama.cpp/tree/prompt-logprobs "
+                "or https://github.com/oobabooga/ik_llama.cpp/tree/prompt-logprobs"
+            )
             return []
 
         # Null first token (no conditioning context); use empty string for BOS

From a1cb5b5dc05d2540640069b9549dd93557c81a16 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 21:56:06 -0700
Subject: [PATCH 31/76] llama.cpp: Disable jinja by default (we use Python
 jinja, not cpp jinja)

This was causing template compilation issues with qwen models.
---
 modules/llama_cpp_server.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2d873f00..a4390adb 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -418,6 +418,7 @@ class LlamaServer:
             "--ubatch-size", str(shared.args.ubatch_size),
             "--port", str(self.port),
             "--no-webui",
+            "--no-jinja",
             "--flash-attn", "on",
         ]
 

From 000d776967f0a73684b85c9d052a738dba073fb6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 05:49:03 -0700
Subject: [PATCH 32/76] Revert "llama.cpp: Disable jinja by default (we use
 Python jinja, not cpp jinja)"

This reverts commit a1cb5b5dc05d2540640069b9549dd93557c81a16.
---
 modules/llama_cpp_server.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index a4390adb..2d873f00 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -418,7 +418,6 @@ class LlamaServer:
             "--ubatch-size", str(shared.args.ubatch_size),
             "--port", str(self.port),
             "--no-webui",
-            "--no-jinja",
             "--flash-attn", "on",
         ]
 

From 66d1a22c733b04c38d89a128a7eeacd8e142e629 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 05:56:36 -0700
Subject: [PATCH 33/76] Fix crash when no model is selected (None passed to
 resolve_model_path)

---
 modules/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/utils.py b/modules/utils.py
index b01953ee..c4acf714 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -105,6 +105,9 @@ def resolve_model_path(model_name_or_path, image_model=False):
     before the default models directory.
     """
 
+    if model_name_or_path is None:
+        raise FileNotFoundError("No model specified.")
+
     path_candidate = Path(model_name_or_path)
     if path_candidate.exists():
         return path_candidate

From 8bba9ecc3fc0afc044a1f6810f014f721dbb7809 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 05:58:05 -0700
Subject: [PATCH 34/76] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index b38ae848..91d27d86 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 7fb3a7d9..eea869b1 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 4a0f764c..391973b7 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 942d5d71..4d0ffe29 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 6b61dca7..44e54eaa 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index a4d6cc97..41d6aad6 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 5aff54b2..91b58e0d 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 0771f53e..36d6dcb1 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 427d59b2..0d882b83 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index c47a6ca1..d79832e5 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index e491e357..3e1de9c9 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 5870983a..40e68d99 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index d11d337d..9e61ad3f 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index c2b69e1c..cdd1218f 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 7f280930..b7422758 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 322056be..372e718b 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index dfd52be5..3e539988 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 95d6c53e13673defecff6def4aead4c7ea157911 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 07:30:48 -0700
Subject: [PATCH 35/76] Revert "API: Add warning about vanilla llama-server not
 supporting prompt logprobs + instructions"

This reverts commit 42dfcdfc5b50333c40a6adda0f4c8672508212cb.
---
 modules/llama_cpp_server.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2d873f00..34080466 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -333,12 +333,6 @@ class LlamaServer:
 
         prompt_probs = result.get("prompt_probabilities", [])
         if not prompt_probs:
-            logger.warning(
-                "The llama.cpp server did not return prompt probabilities. "
-                "This feature requires a custom build with prompt_logprobs support. "
-                "See: https://github.com/oobabooga/llama.cpp/tree/prompt-logprobs "
-                "or https://github.com/oobabooga/ik_llama.cpp/tree/prompt-logprobs"
-            )
             return []
 
         # Null first token (no conditioning context); use empty string for BOS

From 131a9a0140baeef90061bb97065d32b23385e142 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 09:15:03 -0700
Subject: [PATCH 36/76] Update llama.cpp

---
 requirements/full/requirements.txt                   | 8 ++++----
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 8 ++++----
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 15 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 91d27d86..8816f76e 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index eea869b1..466b680f 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 391973b7..49a948c7 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 4d0ffe29..508c137a 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 44e54eaa..17ecbd61 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,7 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 91b58e0d..73d3f3b6 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 36d6dcb1..95c23424 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 0d882b83..4d18875b 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index d79832e5..a181212b 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 3e1de9c9..5ddd53e1 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 40e68d99..3ab238ac 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 9e61ad3f..624fbe5a 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index cdd1218f..c1ab1758 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # ik_llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index b7422758..6d17200d 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 3e539988..cdeb2b79 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From 8e8e1ba8984cc3cef4b4c0d88e7c9eb7977dd3fe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 09:50:15 -0700
Subject: [PATCH 37/76] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 8816f76e..3b5501f4 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 466b680f..eb7f8618 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 49a948c7..e11522a9 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 508c137a..76d8a709 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 17ecbd61..8d4df234 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 41d6aad6..cecc2a25 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 73d3f3b6..42db46a4 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 95c23424..5e0b589b 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 4d18875b..711c68f9 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index a181212b..f1bbccf0 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 5ddd53e1..dc2807f2 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 3ab238ac..6d34b894 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 624fbe5a..5b3bac83 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index c1ab1758..b8e0897d 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 6d17200d..fd623b0b 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 372e718b..92c910ac 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index cdeb2b79..bc17dda9 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio-4.37.2+custom.15-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.15/gradio_client-1.0.2+custom.15-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 6b66da84d2dbccf63ffe79939edf92ad935bb3ee Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 10:01:51 -0700
Subject: [PATCH 38/76] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 3b5501f4..4d9d243c 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index eb7f8618..b6caf320 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index e11522a9..ae2f6263 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 76d8a709..f4783d2b 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 8d4df234..11d670b6 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index cecc2a25..d4b1ca80 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 42db46a4..901d1494 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5e0b589b..5705c64c 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 711c68f9..3c3deaed 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index f1bbccf0..3f1e814c 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index dc2807f2..3bfed7f8 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 6d34b894..dd7059c8 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 5b3bac83..4fdd10fe 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index b8e0897d..1b377463 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index fd623b0b..b85607d3 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 92c910ac..39628ee6 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index bc17dda9..16aa5593 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio-4.37.2+custom.16-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.16/gradio_client-1.0.2+custom.16-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 5fb8c4fbd6d8112429335b48c93a6fe941f4c5e3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 11:02:00 -0700
Subject: [PATCH 39/76] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 4d9d243c..b7a5ca97 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index b6caf320..2c627585 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ae2f6263..7e3fc35f 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index f4783d2b..2603201d 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 11d670b6..fe3bf3ba 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index d4b1ca80..acae301e 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 901d1494..56795843 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5705c64c..abaa1338 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 3c3deaed..b22a03d9 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 3f1e814c..97c5903c 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 3bfed7f8..57e92f74 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index dd7059c8..1f7d27a7 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 4fdd10fe..65f6a004 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index 1b377463..0a82adb7 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index b85607d3..3d812045 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 39628ee6..91bef10b 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 16aa5593..7c61f0cc 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio-4.37.2+custom.17-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.17/gradio_client-1.0.2+custom.17-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 8ecdb41078cfaf54fa0be66d54cf6e3911936b68 Mon Sep 17 00:00:00 2001
From: oobabooga <oobabooga4@gmail.com>
Date: Fri, 3 Apr 2026 19:36:50 -0300
Subject: [PATCH 40/76] fix(security): sanitize filenames in all prompt file
 operations (CWE-22) (#7462)

---------

Co-authored-by: Alex Chen <ffulbtech@gmail.com>
---
 modules/prompts.py     | 2 ++
 modules/ui_default.py  | 5 ++++-
 modules/ui_notebook.py | 6 +++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/modules/prompts.py b/modules/prompts.py
index d107ce5a..85dc32e3 100644
--- a/modules/prompts.py
+++ b/modules/prompts.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 
 from modules import shared, utils
+from modules.utils import sanitize_filename
 from modules.text_generation import get_encoded_length
 
 
@@ -18,6 +19,7 @@ def load_prompt(fname):
 
         return initial_content
 
+    fname = sanitize_filename(fname)
     file_path = shared.user_data_dir / 'logs' / 'notebook' / f'{fname}.txt'
     if file_path.exists():
         with open(file_path, 'r', encoding='utf-8') as f:
diff --git a/modules/ui_default.py b/modules/ui_default.py
index 2c367cca..48cb2fc2 100644
--- a/modules/ui_default.py
+++ b/modules/ui_default.py
@@ -10,7 +10,7 @@ from modules.text_generation import (
     stop_everything_event
 )
 from modules.ui_notebook import store_notebook_state_and_debounce
-from modules.utils import gradio
+from modules.utils import gradio, sanitize_filename
 
 inputs = ('textbox-default', 'interface_state')
 outputs = ('output_textbox', 'html-default')
@@ -167,6 +167,7 @@ def handle_new_prompt():
 
 
 def handle_delete_prompt_confirm_default(prompt_name):
+    prompt_name = sanitize_filename(prompt_name)
     available_prompts = utils.get_available_prompts()
     current_index = available_prompts.index(prompt_name) if prompt_name in available_prompts else 0
 
@@ -199,6 +200,8 @@ def handle_rename_prompt_click_default(current_name):
 
 
 def handle_rename_prompt_confirm_default(new_name, current_name):
+    new_name = sanitize_filename(new_name)
+    current_name = sanitize_filename(current_name)
     old_path = shared.user_data_dir / "logs" / "notebook" / f"{current_name}.txt"
     new_path = shared.user_data_dir / "logs" / "notebook" / f"{new_name}.txt"
 
diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py
index f550e646..88f00ac5 100644
--- a/modules/ui_notebook.py
+++ b/modules/ui_notebook.py
@@ -11,7 +11,7 @@ from modules.text_generation import (
     get_token_ids,
     stop_everything_event
 )
-from modules.utils import gradio
+from modules.utils import gradio, sanitize_filename
 
 _notebook_file_lock = threading.Lock()
 _notebook_auto_save_timer = None
@@ -202,6 +202,7 @@ def handle_new_prompt():
 
 
 def handle_delete_prompt_confirm_notebook(prompt_name):
+    prompt_name = sanitize_filename(prompt_name)
     available_prompts = utils.get_available_prompts()
     current_index = available_prompts.index(prompt_name) if prompt_name in available_prompts else 0
 
@@ -233,6 +234,8 @@ def handle_rename_prompt_click_notebook(current_name):
 
 
 def handle_rename_prompt_confirm_notebook(new_name, current_name):
+    new_name = sanitize_filename(new_name)
+    current_name = sanitize_filename(current_name)
     old_path = shared.user_data_dir / "logs" / "notebook" / f"{current_name}.txt"
     new_path = shared.user_data_dir / "logs" / "notebook" / f"{new_name}.txt"
 
@@ -249,6 +252,7 @@ def handle_rename_prompt_confirm_notebook(new_name, current_name):
 
 def autosave_prompt(text, prompt_name):
     """Automatically save the text to the selected prompt file"""
+    prompt_name = sanitize_filename(prompt_name)
     if prompt_name and text.strip():
         prompt_path = shared.user_data_dir / "logs" / "notebook" / f"{prompt_name}.txt"
         prompt_path.parent.mkdir(parents=True, exist_ok=True)

From fc35acab9b07f1b0dc57b89a7cb459894aa44c5b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 16:56:15 -0700
Subject: [PATCH 41/76] API: Fix tool call parser crash on non-dict JSON output

---
 modules/tool_parsing.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index 45da25c9..919e523a 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -699,6 +699,8 @@ def parse_tool_call(answer: str, tool_names: list[str], return_prefix: bool = Fa
                 if not isinstance(candidates, list):
                     candidates = [candidates]
                 for candidate_dict in candidates:
+                    if not isinstance(candidate_dict, dict):
+                        continue
                     checked_candidate = check_and_sanitize_tool_call_candidate(candidate_dict, tool_names)
                     if checked_candidate is not None:
                         matches.append(checked_candidate)

From 2fbaee58cd7c65c22267410f8a77b6c04b3ee954 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 20:54:28 -0700
Subject: [PATCH 42/76] Add Windows + ROCm portable builds

---
 .github/workflows/build-everything-tgw.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 40d9db5d..0b65dfd6 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -41,6 +41,13 @@ jobs:
       version: ${{ inputs.version }}
       config: 'os:ubuntu-22.04'
 
+  build_release_rocm_windows:
+    name: ROCm Windows
+    uses: ./.github/workflows/build-portable-release-rocm.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:windows-2022'
+
   build_release_rocm_linux:
     name: ROCm Linux
     uses: ./.github/workflows/build-portable-release-rocm.yml

From 54b2f39c780a0a27d2fe27dc114a0aabe09e0249 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Apr 2026 22:07:21 -0700
Subject: [PATCH 43/76] Cleanup modules/chat.py

---
 modules/chat.py | 51 ++++++++++++++++++++-----------------------------
 1 file changed, 21 insertions(+), 30 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 818309e6..e28d3963 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -694,7 +694,7 @@ def get_stopping_strings(state):
         # Find positions of each message content
         first_user_end = prompt.find("first user message") + len("first user message")
         first_assistant_start = prompt.find("first assistant message")
-        first_assistant_end = prompt.find("first assistant message") + len("first assistant message")
+        first_assistant_end = first_assistant_start + len("first assistant message")
         second_user_start = prompt.find("second user message")
         second_assistant_end = prompt.find("second assistant message") + len("second assistant message")
 
@@ -1819,7 +1819,8 @@ def load_history(unique_id, character, mode):
     if not p.exists():
         return {'internal': [], 'visible': [], 'metadata': {}}
 
-    f = json.loads(open(p, 'rb').read())
+    with open(p, 'rb') as fh:
+        f = json.loads(fh.read())
     if 'internal' in f and 'visible' in f:
         history = f
     else:
@@ -1883,19 +1884,17 @@ def generate_pfp_cache(character):
     if not cache_folder.exists():
         cache_folder.mkdir()
 
-    for path in [shared.user_data_dir / 'characters' / f"{character}.{extension}" for extension in ['png', 'jpg', 'jpeg']]:
+    for extension in ['png', 'jpg', 'jpeg']:
+        path = shared.user_data_dir / 'characters' / f"{character}.{extension}"
         if path.exists():
             original_img = Image.open(path)
-            # Define file paths
-            pfp_path = Path(f'{cache_folder}/pfp_character.png')
-            thumb_path = Path(f'{cache_folder}/pfp_character_thumb.png')
+            pfp_path = cache_folder / 'pfp_character.png'
+            thumb_path = cache_folder / 'pfp_character_thumb.png'
 
-            # Save main picture and thumbnail
             original_img.save(pfp_path, format='PNG')
             thumb = make_thumbnail(original_img)
             thumb.save(thumb_path, format='PNG')
 
-            # Return the path to the thumbnail, not the in-memory PIL Image object.
             return str(thumb_path)
 
     return None
@@ -1916,13 +1915,13 @@ def load_character(character, name1, name2):
         logger.error(f"Could not find the character \"{character}\" inside {shared.user_data_dir}/characters. No character has been loaded.")
         raise ValueError
 
-    file_contents = open(filepath, 'r', encoding='utf-8').read()
+    with open(filepath, 'r', encoding='utf-8') as fh:
+        file_contents = fh.read()
     data = json.loads(file_contents) if extension == "json" else yaml.safe_load(file_contents)
     cache_folder = Path(shared.args.disk_cache_dir)
 
-    for path in [Path(f"{cache_folder}/pfp_character.png"), Path(f"{cache_folder}/pfp_character_thumb.png")]:
-        if path.exists():
-            path.unlink()
+    for path in [cache_folder / "pfp_character.png", cache_folder / "pfp_character_thumb.png"]:
+        path.unlink(missing_ok=True)
 
     picture = generate_pfp_cache(character)
 
@@ -1978,9 +1977,7 @@ def clear_character_for_ui(state):
     # Clear the cache files
     cache_folder = Path(shared.args.disk_cache_dir)
     for cache_file in ['pfp_character.png', 'pfp_character_thumb.png']:
-        cache_path = Path(f'{cache_folder}/{cache_file}')
-        if cache_path.exists():
-            cache_path.unlink()
+        (cache_folder / cache_file).unlink(missing_ok=True)
 
     return state, state['name2'], state['context'], state['greeting'], None
 
@@ -2075,11 +2072,10 @@ def upload_your_profile_picture(img_path):
         cache_folder.mkdir()
 
     if img is None:
-        if Path(f"{cache_folder}/pfp_me.png").exists():
-            Path(f"{cache_folder}/pfp_me.png").unlink()
+        (cache_folder / "pfp_me.png").unlink(missing_ok=True)
     else:
         img = make_thumbnail(img)
-        img.save(Path(f'{cache_folder}/pfp_me.png'))
+        img.save(cache_folder / 'pfp_me.png')
         logger.info(f'Profile picture saved to "{cache_folder}/pfp_me.png"')
 
 
@@ -2135,13 +2131,12 @@ def generate_user_pfp_cache(user):
     if not cache_folder.exists():
         cache_folder.mkdir()
 
-    for path in [shared.user_data_dir / 'users' / f"{user}.{extension}" for extension in ['png', 'jpg', 'jpeg']]:
+    for extension in ['png', 'jpg', 'jpeg']:
+        path = shared.user_data_dir / 'users' / f"{user}.{extension}"
         if path.exists():
             original_img = Image.open(path)
-            # Define file paths
-            pfp_path = Path(f'{cache_folder}/pfp_me.png')
+            pfp_path = cache_folder / 'pfp_me.png'
 
-            # Save thumbnail
             thumb = make_thumbnail(original_img)
             thumb.save(pfp_path, format='PNG')
             logger.info(f'User profile picture cached to "{pfp_path}"')
@@ -2173,9 +2168,7 @@ def load_user(user_name, name1, user_bio):
 
     # Clear existing user picture cache
     cache_folder = Path(shared.args.disk_cache_dir)
-    pfp_path = Path(f"{cache_folder}/pfp_me.png")
-    if pfp_path.exists():
-        pfp_path.unlink()
+    (cache_folder / "pfp_me.png").unlink(missing_ok=True)
 
     # Generate new picture cache
     picture = generate_user_pfp_cache(user_name)
@@ -2599,15 +2592,13 @@ def handle_character_picture_change(picture_path):
 
     if picture is not None:
         # Save to cache
-        picture.save(Path(f'{cache_folder}/pfp_character.png'), format='PNG')
+        picture.save(cache_folder / 'pfp_character.png', format='PNG')
         thumb = make_thumbnail(picture)
-        thumb.save(Path(f'{cache_folder}/pfp_character_thumb.png'), format='PNG')
+        thumb.save(cache_folder / 'pfp_character_thumb.png', format='PNG')
     else:
         # Remove cache files when picture is cleared
         for cache_file in ['pfp_character.png', 'pfp_character_thumb.png']:
-            cache_path = Path(f'{cache_folder}/{cache_file}')
-            if cache_path.exists():
-                cache_path.unlink()
+            (cache_folder / cache_file).unlink(missing_ok=True)
 
 
 def handle_mode_change(state):

From 16af11f8680180b24876ee77d5ce29fcd20cd8db Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 04:22:37 -0700
Subject: [PATCH 44/76] Update README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ab6cc2e5..23cd09c5 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ A Gradio web UI for running Large Language Models locally. 100% private and offl
 ## Features
 
 - **Easy setup**: [Portable builds](https://github.com/oobabooga/text-generation-webui/releases) (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or a one-click installer for the full feature set.
-- **Multiple backends**: [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Switch between backends and models without restarting.
+- **Multiple backends**: [llama.cpp](https://github.com/ggerganov/llama.cpp), [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Switch between backends and models without restarting.
 - **OpenAI/Anthropic-compatible API**: Chat, Completions, and Messages endpoints with tool-calling support. Use as a local drop-in replacement for the OpenAI/Anthropic APIs ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
 - **Tool-calling**: Models can call custom functions during chat — web search, page fetching, math, and more. Each tool is a single `.py` file, easy to create and extend ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Tool-Calling-Tutorial)).
 - **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).

From e0ad4e60df40378f9846e1fad20e337e4bb2cb8f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 09:57:07 -0700
Subject: [PATCH 45/76] UI: Fix tool buffer check truncating visible text at
 end of generation

---
 modules/chat.py         | 2 +-
 modules/tool_parsing.py | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index e28d3963..76b8694a 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1183,7 +1183,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     # visible text from before buffering started so raw markup doesn't flash
     # in the UI.  The internal text is left intact so the caller can still
     # parse tool calls from it.
-    if is_stream and _check_tool_markers and streaming_tool_buffer_check(output['internal'][-1][1], markers=_streaming_markers, tool_names=_tool_names, check_bare_names=_check_bare_names):
+    if is_stream and _check_tool_markers and streaming_tool_buffer_check(output['internal'][-1][1], markers=_streaming_markers, tool_names=_tool_names, check_bare_names=_check_bare_names, partial_match=False):
         output['visible'][-1][1] = _last_visible_before_tool_buffer or ''
 
     yield output
diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index 919e523a..7fcf58b7 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -31,7 +31,7 @@ TOOL_CALL_OPENING_MARKERS = [
 ]
 
 
-def streaming_tool_buffer_check(text, markers=None, tool_names=None, check_bare_names=False):
+def streaming_tool_buffer_check(text, markers=None, tool_names=None, check_bare_names=False, partial_match=True):
     '''
     Check whether streaming output should be withheld because it may
     contain tool-call markup.
@@ -43,6 +43,10 @@ def streaming_tool_buffer_check(text, markers=None, tool_names=None, check_bare_
         tool_names: List of tool function names.
         check_bare_names: Whether to do partial-prefix matching on tool
                           names (for models with unknown template format).
+        partial_match: Whether to check partial prefixes of markers/names.
+                       Set to False for end-of-generation checks where a
+                       partial prefix is just normal text, not an incomplete
+                       tool call.
     '''
     # Strip thinking blocks so tool-call syntax inside <think> doesn't
     # trigger false positives.
@@ -60,6 +64,9 @@ def streaming_tool_buffer_check(text, markers=None, tool_names=None, check_bare_
             if name + '{' in text or name + ' {' in text:
                 return True
 
+    if not partial_match:
+        return False
+
     # Partial-prefix matching: only for template-specific markers.
     for marker in (markers if markers is not None else TOOL_CALL_OPENING_MARKERS):
         for prefix_len in range(min(len(marker) - 1, len(text)), 0, -1):

From 9183dc444e6c62ae4e33a759e03d9b2b66a49bf2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 10:48:53 -0700
Subject: [PATCH 46/76] API: Fix loader args leaking between sequential model
 loads

---
 modules/api/models.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/modules/api/models.py b/modules/api/models.py
index b89397d3..e0bd21f3 100644
--- a/modules/api/models.py
+++ b/modules/api/models.py
@@ -47,7 +47,6 @@ def _load_model(data):
 
     unload_model()
     model_settings = get_model_metadata(model_name)
-    update_model_parameters(model_settings)
 
     # Update shared.args with custom model loading settings
     # Security: only allow keys that correspond to model loading
@@ -55,6 +54,16 @@ def _load_model(data):
     # flags like trust_remote_code or extra_flags to be set via the API.
     blocked_keys = {'extra_flags'}
     allowed_keys = set(loaders.list_model_elements()) - blocked_keys
+
+    # Reset all loader args to their startup values before applying new ones,
+    # so settings from a previous API load don't leak into this one.
+    # Include blocked keys in the reset (safe: restores startup value, not API-controlled).
+    for k in allowed_keys | blocked_keys:
+        if hasattr(shared.args, k) and hasattr(shared.original_args, k):
+            setattr(shared.args, k, getattr(shared.original_args, k))
+
+    update_model_parameters(model_settings)
+
     if args:
         for k in args:
             if k in allowed_keys and hasattr(shared.args, k):

From 2eef90a32346f5acea1771803460504294b9d9be Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 11:00:14 -0700
Subject: [PATCH 47/76] API: Remove deprecated "settings" parameter from model
 load endpoint

---
 modules/api/models.py | 14 +-------------
 modules/api/script.py | 20 ++++----------------
 modules/api/typing.py |  1 -
 3 files changed, 5 insertions(+), 30 deletions(-)

diff --git a/modules/api/models.py b/modules/api/models.py
index e0bd21f3..5dd77850 100644
--- a/modules/api/models.py
+++ b/modules/api/models.py
@@ -1,5 +1,4 @@
 from modules import loaders, shared
-from modules.logging_colors import logger
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model
 from modules.models_settings import get_model_metadata, update_model_parameters
@@ -42,8 +41,7 @@ def model_info_dict(model_name: str) -> dict:
 
 def _load_model(data):
     model_name = data["model_name"]
-    args = data["args"]
-    settings = data["settings"]
+    args = data.get("args")
 
     unload_model()
     model_settings = get_model_metadata(model_name)
@@ -71,16 +69,6 @@ def _load_model(data):
 
     shared.model, shared.tokenizer = load_model(model_name)
 
-    # Update shared.settings with custom generation defaults
-    if settings:
-        for k in settings:
-            if k in shared.settings:
-                shared.settings[k] = settings[k]
-                if k == 'truncation_length':
-                    logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}")
-                elif k == 'instruction_template':
-                    logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}")
-
 
 def list_loras():
     return {'lora_names': get_available_loras()[1:]}
diff --git a/modules/api/script.py b/modules/api/script.py
index 85f4974f..beed3d06 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -475,10 +475,8 @@ async def handle_list_models():
 @app.post("/v1/internal/model/load", dependencies=check_admin_key)
 async def handle_load_model(request_data: LoadModelRequest):
     '''
-    This endpoint is experimental and may change in the future.
-
-    The "args" parameter can be used to modify flags like "--load-in-4bit"
-    or "--n-gpu-layers" before loading a model. Example:
+    The "args" parameter can be used to modify loader flags before loading
+    a model. Example:
 
     ```
     "args": {
@@ -487,18 +485,8 @@ async def handle_load_model(request_data: LoadModelRequest):
     }
     ```
 
-    Note that those settings will remain after loading the model. So you
-    may need to change them back to load a second model.
-
-    The "settings" parameter is also a dict but with keys for the
-    shared.settings object. It can be used to modify the default instruction
-    template like this:
-
-    ```
-    "settings": {
-      "instruction_template": "Alpaca"
-    }
-    ```
+    Loader args are reset to their startup defaults between loads, so
+    settings from a previous load do not leak into the next one.
     '''
 
     try:
diff --git a/modules/api/typing.py b/modules/api/typing.py
index 1d486e8f..a758743e 100644
--- a/modules/api/typing.py
+++ b/modules/api/typing.py
@@ -271,7 +271,6 @@ class ModelListResponse(BaseModel):
 class LoadModelRequest(BaseModel):
     model_name: str
     args: dict | None = None
-    settings: dict | None = None
 
 
 class LoraListResponse(BaseModel):

From 7fed60f90ad9a37a9f0656d054bb5553a0e8da13 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 18:29:36 -0700
Subject: [PATCH 48/76] UI: Improve the hover menu's appearance

---
 css/main.css       | 91 +++++++++++++++++++++++++++++++++++-----------
 js/main.js         | 44 +++++++++++-----------
 modules/ui_chat.py |  2 +-
 3 files changed, 92 insertions(+), 45 deletions(-)

diff --git a/css/main.css b/css/main.css
index 009b7c0a..913576c5 100644
--- a/css/main.css
+++ b/css/main.css
@@ -735,7 +735,30 @@ audio {
 
 .hover-element {
     position: relative;
-    font-size: 24px;
+    padding-top: 4px;
+}
+
+#hover-element-button {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    width: 32px;
+    height: 32px;
+    border-radius: 0.5rem;
+    cursor: pointer;
+    color: gray;
+}
+
+#hover-element-button:hover {
+    background-color: var(--background-fill-secondary);
+}
+
+#hover-element-button svg {
+    color: inherit;
+}
+
+.dark #hover-element-button:hover {
+    background-color: var(--selected-item-color-dark);
 }
 
 .hover-menu {
@@ -743,27 +766,40 @@ audio {
     position: absolute;
     bottom: 100%;
     left: 0;
-    box-shadow: 0 2px 12px rgb(0 0 0 / 15%);
-    border-radius: 0.5rem;
+    background: white;
+    border: 1px solid rgba(0, 0, 0, 0.1);
+    box-shadow: 0 4px 16px rgb(0 0 0 / 12%), 0 1px 3px rgb(0 0 0 / 8%);
+    border-radius: 0.75rem;
     z-index: 10000;
     min-width: 330px;
     flex-direction: column;
-    overflow: hidden;
+    padding: 4px;
+}
+
+.hover-menu::before {
+    content: '';
+    position: absolute;
+    top: 100%;
+    left: 0;
+    width: 100%;
+    height: 8px;
+}
+
+.hover-menu > * {
+    border: none !important;
+    box-shadow: none !important;
 }
 
 .hover-menu button {
     width: 100%;
-    background: white !important;
-    border-radius: 0 !important;
+    background: transparent !important;
+    border: none !important;
+    border-radius: 0.5rem !important;
     justify-content: space-between;
     margin: 0 !important;
     height: 36px;
-    border-color: transparent !important;
-    transition: background-color 0.15s ease;
-}
-
-.hover-menu button:not(#clear-history-confirm) {
-    border-bottom: 0 !important;
+    font-weight: 500;
+    box-shadow: none !important;
 }
 
 .hover-menu button:hover {
@@ -775,19 +811,26 @@ audio {
 }
 
 #show-controls {
-    background-color: white;
-    border-color: transparent !important;
+    background-color: transparent;
+    border: none !important;
     height: 36px;
-    border-radius: 0;
-    border-bottom: 0 !important;
+    border-radius: 0.5rem;
     padding-top: 3px;
     padding-left: 4px;
     display: flex;
     font-weight: normal;
 }
 
+#show-controls:hover {
+    background-color: #dbeafe;
+}
+
 .dark #show-controls {
-    background-color: var(--darker-gray);
+    background-color: transparent;
+}
+
+.dark #show-controls:hover {
+    background-color: var(--selected-item-color-dark);
 }
 
 #show-controls label {
@@ -797,12 +840,12 @@ audio {
     width: 100%;
     padding-right: 12px;
     gap: 10px;
-    font-weight: 600;
+    font-weight: 500;
     color: var(--button-secondary-text-color);
 }
 
 #show-controls label input {
-    margin-top: 4px;
+    margin-top: 5px;
 }
 
 .transparent-substring {
@@ -817,6 +860,7 @@ audio {
     min-width: 0 !important;
     display: flex;
     flex-direction: column-reverse;
+    padding-left: 12px;
     padding-right: 20px;
     padding-bottom: 3px;
     flex-grow: 0 !important;
@@ -1208,9 +1252,14 @@ audio {
     color: #9ca3af;
 }
 
+.dark .hover-menu {
+    background: var(--darker-gray);
+    border-color: transparent;
+    box-shadow: 0 4px 16px rgb(0 0 0 / 40%);
+}
+
 .dark .hover-menu button {
-    border-color: var(--border-color-primary);
-    background-color: var(--darker-gray) !important;
+    background-color: transparent !important;
 }
 
 .dark #chat-controls,
diff --git a/js/main.js b/js/main.js
index cba4c903..918c85c1 100644
--- a/js/main.js
+++ b/js/main.js
@@ -309,18 +309,19 @@ for (let i = 0; i < slimDropdownElements.length; i++) {
 // https://github.com/SillyTavern/SillyTavern/blob/6c8bd06308c69d51e2eb174541792a870a83d2d6/public/script.js
 //------------------------------------------------
 var buttonsInChat = document.querySelectorAll("#chat-tab #chat-buttons button, #chat-tab #chat-buttons #show-controls");
+var hoverContainer = document.getElementById("gr-hover-container");
 var button = document.getElementById("hover-element-button");
 var menu = document.getElementById("hover-menu");
 var istouchscreen = (navigator.maxTouchPoints > 0) || "ontouchstart" in document.documentElement;
 
 function showMenu() {
-  menu.style.display = "flex"; // Show the menu
+  menu.style.display = "flex";
 }
 
 function hideMenu() {
-  menu.style.display = "none"; // Hide the menu
+  menu.style.display = "none";
   if (!istouchscreen) {
-    document.querySelector("#chat-input textarea").focus(); // Focus on the chat input
+    document.querySelector("#chat-input textarea").focus();
   }
 }
 
@@ -329,7 +330,6 @@ if (buttonsInChat.length > 0) {
     const thisButton = buttonsInChat[i];
     menu.appendChild(thisButton);
 
-    // Only apply transformations to button elements
     if (thisButton.tagName.toLowerCase() === "button") {
       thisButton.addEventListener("click", () => {
         hideMenu();
@@ -339,7 +339,6 @@ if (buttonsInChat.length > 0) {
       const matches = buttonText.match(/(\(.*?\))/);
 
       if (matches && matches.length > 1) {
-        // Apply the transparent-substring class to the matched substring
         const substring = matches[1];
         const newText = buttonText.replace(substring, `&nbsp;<span class="transparent-substring">${substring.slice(1, -1)}</span>`);
         thisButton.innerHTML = newText;
@@ -348,16 +347,19 @@ if (buttonsInChat.length > 0) {
   }
 }
 
-function isMouseOverButtonOrMenu() {
-  return menu.matches(":hover") || button.matches(":hover");
-}
+var menuInteracting = false;
 
-button.addEventListener("mouseenter", function () {
+hoverContainer.addEventListener("mouseenter", function () {
   if (!istouchscreen) {
     showMenu();
   }
 });
 
+hoverContainer.addEventListener("mousedown", function () {
+  menuInteracting = true;
+  setTimeout(function () { menuInteracting = false; }, 300);
+});
+
 button.addEventListener("click", function () {
   if (menu.style.display === "flex") {
     hideMenu();
@@ -367,24 +369,20 @@ button.addEventListener("click", function () {
   }
 });
 
-// Delay to prevent menu hiding when the mouse leaves the button or menu
-function delayedHideMenu() {
-  setTimeout(function () {
-    if (!isMouseOverButtonOrMenu()) {
-      hideMenu();
-    }
-  }, 100);
-}
-
-// Add event listener for mouseleave on the button
-button.addEventListener("mouseleave", delayedHideMenu);
-// Add event listener for mouseleave on the menu
-menu.addEventListener("mouseleave", delayedHideMenu);
+hoverContainer.addEventListener("mouseleave", function () {
+  if (!istouchscreen) {
+    setTimeout(function () {
+      if (!hoverContainer.matches(":hover") && !menu.matches(":hover")) {
+        hideMenu();
+      }
+    }, 50);
+  }
+});
 
 // Add event listener for click anywhere in the document
 document.addEventListener("click", function (event) {
   // Check if the click is outside the button/menu and the menu is visible
-  if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") {
+  if (!menuInteracting && !event.target.closest("#gr-hover-container") && menu.style.display === "flex") {
     hideMenu();
   }
 
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 10d05f65..d9652253 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -52,7 +52,7 @@ def create_ui():
                 shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': [], 'metadata': {}}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
                 with gr.Row(elem_id="chat-input-row"):
                     with gr.Column(scale=1, elem_id='gr-hover-container'):
-                        gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')
+                        gr.HTML(value='<div class="hover-element" onclick="void(0)"><span id="hover-element-button"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><line x1="4" y1="6" x2="20" y2="6"></line><line x1="4" y1="12" x2="20" y2="12"></line><line x1="4" y1="18" x2="20" y2="18"></line></svg></span><div class="hover-menu" id="hover-menu"></div></div>', elem_id='gr-hover')
 
                     with gr.Column(scale=10, elem_id='chat-input-container'):
                         shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf', 'image'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar'])

From ffea8f282e3a2f798d7bf5531be278754b47da21 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 18:53:13 -0700
Subject: [PATCH 49/76] UI: Improve message text contrast

---
 css/html_instruct_style.css |  2 +-
 css/main.css                | 17 +++++++++++++----
 modules/ui.py               |  2 +-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 458feafc..aa61f33b 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -13,7 +13,7 @@
     line-height: 28px !important;
 }
 
-.dark .chat .message-body :is(p,li,h1,h2,h3,h4,h5,h6),
+.dark .chat .message-body :is(p,li),
 .dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6,b,strong) em),
 .dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6,b,strong) q) {
     color: #d1d5db !important;
diff --git a/css/main.css b/css/main.css
index 913576c5..d06d2905 100644
--- a/css/main.css
+++ b/css/main.css
@@ -436,15 +436,24 @@ audio {
 .dark .message-body h4,
 .dark .message-body h5,
 .dark .message-body h6 {
-    color: white !important;
+    color: #e8e8e8 !important;
 }
 
 .dark .message-body blockquote {
     border-left-color: rgb(255 255 255 / 30%);
 }
 
+.message-body h1,
+.message-body h2,
+.message-body h3,
+.message-body h4,
+.message-body h5,
+.message-body h6 {
+    color: #1a1a1a;
+}
+
 .message-body h1 {
-    font-weight: 800;
+    font-weight: 700;
     font-size: 2.25em;
     margin-top: 0;
     margin-bottom: 0.8888889em;
@@ -476,13 +485,13 @@ audio {
 }
 
 .message-body h5 {
-    font-weight: normal;
+    font-weight: 600;
     font-size: 1em;
     margin: 0;
 }
 
 .message-body h6 {
-    font-weight: normal;
+    font-weight: 600;
     font-size: 1em;
     margin: 0;
 }
diff --git a/modules/ui.py b/modules/ui.py
index 02b5a9fb..73072cbe 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -75,7 +75,7 @@ if not shared.args.old_colors:
         background_fill_primary_dark='var(--darker-gray, #1C1C1D)',
         body_background_fill="white",
         block_background_fill="transparent",
-        body_text_color='rgb(64, 64, 64)',
+        body_text_color='#1a1a1a',
         button_secondary_background_fill="white",
         button_secondary_border_color="var(--border-color-primary)",
         block_title_text_color='*body_text_color',

From 41bce3f4dee83ede8ba05a3f3cdab9e729ec0979 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 19:07:23 -0700
Subject: [PATCH 50/76] UI: Improve scrollbars style

---
 css/main.css | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/css/main.css b/css/main.css
index d06d2905..c54367e6 100644
--- a/css/main.css
+++ b/css/main.css
@@ -246,8 +246,8 @@ button {
 
 .pretty_scrollbar::-webkit-scrollbar,
 #image-history-gallery > :nth-child(2)::-webkit-scrollbar {
-    width: 8px;
-    height: 8px;
+    width: 7px;
+    height: 7px;
 }
 
 .pretty_scrollbar::-webkit-scrollbar-track,
@@ -260,7 +260,7 @@ button {
 #image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb,
 #image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb:hover {
     background: var(--neutral-300);
-    border-radius: 30px;
+    border-radius: 9999px;
 }
 
 .dark .pretty_scrollbar::-webkit-scrollbar-thumb,
@@ -268,18 +268,17 @@ button {
 .dark #image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb,
 .dark #image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb:hover {
     background: rgb(255 255 255 / 6.25%);
-    border-radius: 30px;
+    border-radius: 9999px;
 }
 
 .pretty_scrollbar::-webkit-resizer,
 #image-history-gallery > :nth-child(2)::-webkit-resizer {
-    background: #d2d2d8;
+    background: transparent;
 }
 
 .dark .pretty_scrollbar::-webkit-resizer,
 .dark #image-history-gallery > :nth-child(2)::-webkit-resizer {
-    background: rgb(255 255 255 / 10%);
-    border-radius: 10px;
+    background: transparent;
 }
 
 .pretty_scrollbar::-webkit-scrollbar-corner,
@@ -599,7 +598,7 @@ audio {
 }
 
 #chat-input textarea::-webkit-scrollbar {
-    width: 8px;
+    width: 7px;
 }
 
 #chat-input textarea::-webkit-scrollbar-track {
@@ -608,7 +607,7 @@ audio {
 
 #chat-input textarea::-webkit-scrollbar-thumb {
     background: var(--neutral-300);
-    border-radius: 30px;
+    border-radius: 9999px;
 }
 
 .dark #chat-input textarea::-webkit-scrollbar-thumb {
@@ -869,7 +868,6 @@ audio {
     min-width: 0 !important;
     display: flex;
     flex-direction: column-reverse;
-    padding-left: 12px;
     padding-right: 20px;
     padding-bottom: 3px;
     flex-grow: 0 !important;
@@ -2000,8 +1998,8 @@ thead + tbody tr:first-child th { border-top: 1px solid; }
 
 /* Pretty scrollbar for the tools list */
 #tools-group .wrap::-webkit-scrollbar {
-    width: 8px;
-    height: 8px;
+    width: 7px;
+    height: 7px;
 }
 
 #tools-group .wrap::-webkit-scrollbar-track {
@@ -2011,13 +2009,13 @@ thead + tbody tr:first-child th { border-top: 1px solid; }
 #tools-group .wrap::-webkit-scrollbar-thumb,
 #tools-group .wrap::-webkit-scrollbar-thumb:hover {
     background: var(--neutral-300);
-    border-radius: 30px;
+    border-radius: 9999px;
 }
 
 .dark #tools-group .wrap::-webkit-scrollbar-thumb,
 .dark #tools-group .wrap::-webkit-scrollbar-thumb:hover {
     background: rgb(255 255 255 / 6.25%);
-    border-radius: 30px;
+    border-radius: 9999px;
 }
 
 #tools-group .wrap::-webkit-scrollbar-corner {

From 8cb7fe9c470101d07f80b236a1d34b906bcdb25a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 19:14:17 -0700
Subject: [PATCH 51/76] UI: Improve message action icon visibility in light
 mode

---
 css/main.css | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/css/main.css b/css/main.css
index c54367e6..41b08308 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1428,12 +1428,11 @@ audio {
 }
 
 .footer-button svg {
-    stroke: rgb(156 163 175);
-    transition: stroke 0.2s;
+    stroke: rgb(107 114 128);
 }
 
 .footer-button:hover svg {
-    stroke: rgb(107 114 128);
+    stroke: rgb(64 64 64);
 }
 
 .dark .footer-button svg {

From 1b403a4ffab0833cdce527360f445a0003c7ea41 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 19:33:05 -0700
Subject: [PATCH 52/76] UI: Fix inline LaTeX rendering by protecting $...$ from
 markdown (closes #7423)

---
 modules/html_generator.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 8f3f261f..8dd46850 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -185,28 +185,29 @@ def process_markdown_content(string):
     if not string:
         return ""
 
-    # Define unique placeholders for LaTeX asterisks and underscores
+    # Define unique placeholders for LaTeX characters that conflict with markdown
     LATEX_ASTERISK_PLACEHOLDER = "LATEXASTERISKPLACEHOLDER"
     LATEX_UNDERSCORE_PLACEHOLDER = "LATEXUNDERSCOREPLACEHOLDER"
+    LATEX_PIPE_PLACEHOLDER = "LATEXPIPEPLACEHOLDER"
+
+    def protect_latex_content(content):
+        """Protect markdown-sensitive characters inside LaTeX."""
+        content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
+        content = content.replace('_', LATEX_UNDERSCORE_PLACEHOLDER)
+        content = content.replace('|', LATEX_PIPE_PLACEHOLDER)
+        return content
 
     def protect_asterisks_underscores_in_latex(match):
-        """A replacer function for re.sub to protect asterisks and underscores in multiple LaTeX formats."""
+        """A replacer function for re.sub to protect markdown-sensitive characters in multiple LaTeX formats."""
         # Check which delimiter group was captured
         if match.group(1) is not None:  # Content from $$...$$
-            content = match.group(1)
-            modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
-            modified_content = modified_content.replace('_', LATEX_UNDERSCORE_PLACEHOLDER)
-            return f'{modified_content}'
+            return protect_latex_content(match.group(1))
         elif match.group(2) is not None:  # Content from \[...\]
-            content = match.group(2)
-            modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
-            modified_content = modified_content.replace('_', LATEX_UNDERSCORE_PLACEHOLDER)
-            return f'\\[{modified_content}\\]'
+            return f'\\[{protect_latex_content(match.group(2))}\\]'
         elif match.group(3) is not None:  # Content from \(...\)
-            content = match.group(3)
-            modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
-            modified_content = modified_content.replace('_', LATEX_UNDERSCORE_PLACEHOLDER)
-            return f'\\({modified_content}\\)'
+            return f'\\({protect_latex_content(match.group(3))}\\)'
+        elif match.group(4) is not None:  # Content from $...$
+            return f'${protect_latex_content(match.group(4).strip())}$'
 
         return match.group(0)  # Fallback
 
@@ -240,7 +241,7 @@ def process_markdown_content(string):
     string = re.sub(r"(.)```", r"\1\n```", string)
 
     # Protect asterisks and underscores within all LaTeX blocks before markdown conversion
-    latex_pattern = re.compile(r'((?:^|[\r\n\s])\$\$[^`]*?\$\$)|\\\[(.*?)\\\]|\\\((.*?)\\\)',
+    latex_pattern = re.compile(r'((?:^|[\r\n\s])\$\$[^`]*?\$\$)|\\\[(.*?)\\\]|\\\((.*?)\\\)|(?<!\$)\$(?!\$)([^\$\n]*\\\\[^\$\n]*?)\$(?!\$)',
                                re.DOTALL)
     string = latex_pattern.sub(protect_asterisks_underscores_in_latex, string)
 
@@ -306,6 +307,7 @@ def process_markdown_content(string):
     # Restore the LaTeX asterisks and underscores after markdown conversion
     html_output = html_output.replace(LATEX_ASTERISK_PLACEHOLDER, '*')
     html_output = html_output.replace(LATEX_UNDERSCORE_PLACEHOLDER, '_')
+    html_output = html_output.replace(LATEX_PIPE_PLACEHOLDER, '|')
 
     # Remove extra newlines before </code>
     html_output = re.sub(r'\s*</code>', '</code>', html_output)

From 0c033caf0ef79838178238912df29cc47bb10ba3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 20:09:28 -0700
Subject: [PATCH 53/76] UI: Reduce spacing above chat input

---
 css/main.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/css/main.css b/css/main.css
index 41b08308..43e9684f 100644
--- a/css/main.css
+++ b/css/main.css
@@ -893,7 +893,7 @@ audio {
 }
 
 #chat-input-row {
-    padding: 1rem;
+    padding: 0.5rem 1rem 1rem;
 }
 
 #chat-col {

From dfd8ec9c4992f801304fc7efb89e7e47355fd18e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 20:13:20 -0700
Subject: [PATCH 54/76] UI: Make accordion outline styling global

---
 css/main.css             | 4 ++--
 modules/training.py      | 4 ++--
 modules/ui_model_menu.py | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/css/main.css b/css/main.css
index 43e9684f..459c9fab 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1443,12 +1443,12 @@ audio {
     stroke: rgb(209 213 219);
 }
 
-.tgw-accordion {
+.block:has(> .label-wrap) {
     padding: 10px 12px !important;
     border: 1px solid #d2d2d8;
 }
 
-.dark .tgw-accordion {
+.dark .block:has(> .label-wrap) {
     border: 1px solid var(--border-color-dark);
 }
 
diff --git a/modules/training.py b/modules/training.py
index 145353c6..bca4f02e 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -52,7 +52,7 @@ def create_ui():
                         with gr.Column():
                             always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background'])
 
-                    with gr.Accordion(label='Target Modules', open=False, elem_classes='tgw-accordion'):
+                    with gr.Accordion(label='Target Modules', open=False):
                         gr.Markdown("Selects which modules to target in training. Targeting more modules is closer to a full fine-tune at the cost of increased VRAM and adapter size.")
                         all_linear = gr.Checkbox(label='Target all linear layers', value=True, info='Targets every nn.Linear layer except lm_head. Works for any model architecture. When checked, the individual module checkboxes below are ignored.', elem_classes=['no-background'])
                         with gr.Row():
@@ -87,7 +87,7 @@ def create_ui():
                             with gr.Row():
                                 lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='cosine', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. "Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.', elem_classes=['slim-dropdown'])
 
-                    with gr.Accordion(label='Advanced Options', open=False, elem_classes='tgw-accordion'):
+                    with gr.Accordion(label='Advanced Options', open=False):
                         with gr.Row():
                             with gr.Column():
                                 optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Optimizer algorithm. adamw_torch is the standard choice. adamw_bnb_8bit uses less VRAM. adafactor is memory-efficient for large models.', elem_classes=['slim-dropdown'])
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 16505afa..243079a0 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -67,13 +67,13 @@ def create_ui():
                             )
 
                             # Multimodal
-                            with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']:
+                            with gr.Accordion("Multimodal (vision)", open=False) as shared.gradio['mmproj_accordion']:
                                 with gr.Row():
                                     shared.gradio['mmproj'] = gr.Dropdown(label="mmproj file", choices=utils.get_available_mmproj(), value=lambda: shared.args.mmproj or 'None', elem_classes='slim-dropdown', info=f'Select a file that matches your model. Must be placed in {shared.user_data_dir}/mmproj/', interactive=not mu)
                                     ui.create_refresh_button(shared.gradio['mmproj'], lambda: None, lambda: {'choices': utils.get_available_mmproj()}, 'refresh-button', interactive=not mu)
 
                             # Speculative decoding
-                            with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
+                            with gr.Accordion("Speculative decoding", open=False) as shared.gradio['speculative_decoding_accordion']:
                                 shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Maximum number of tokens to draft for speculative decoding. Recommended: 4 for draft model, 64 for n-gram.')
 
                                 gr.Markdown('#### Draft model')
@@ -92,7 +92,7 @@ def create_ui():
                                 shared.gradio['spec_ngram_min_hits'] = gr.Number(label="spec-ngram-min-hits", precision=0, step=1, value=shared.args.spec_ngram_min_hits, info='Minimum n-gram hits for ngram-map speculative decoding.', visible=shared.args.spec_type != 'none')
 
                     gr.Markdown("## Other options")
-                    with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'):
+                    with gr.Accordion("See more options", open=False):
                         with gr.Row():
                             with gr.Column():
                                 shared.gradio['parallel'] = gr.Slider(label="parallel", minimum=1, step=1, maximum=64, value=shared.args.parallel, info='Number of parallel request slots for the API. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')

From ee917cd5edfc3b192d4a3147001f0c1752a3e354 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 20:35:27 -0700
Subject: [PATCH 55/76] UI: Make table and hr borders more subtle

---
 css/html_instruct_style.css |  9 ---------
 css/main.css                | 16 +++++++++++++---
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index aa61f33b..fc20d166 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -19,15 +19,6 @@
     color: #d1d5db !important;
 }
 
-.chat .message-body :is(th, td),
-.prose hr {
-    border-color: #40404096 !important;
-}
-
-.dark .chat .message-body :is(th, td),
-.dark .prose hr {
-    border-color: rgb(255 255 255 / 30%) !important;
-}
 
 .chat .message-body :is(p, ul, ol) {
     margin: 1.25em 0 !important;
diff --git a/css/main.css b/css/main.css
index 459c9fab..d9dc5d2e 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1958,14 +1958,24 @@ table, tr, td, th, thead {
     border: 0;
 }
 
+.prose hr {
+    border-color: var(--border-color-primary);
+}
+
 td + td,
-th + th { border-left: 1px solid; }
+th + th {
+    border-left: 1px solid var(--border-color-primary) !important;
+}
 
 tr + tr td,
-tr + tr th { border-top: 1px solid; }
+tr + tr th {
+    border-top: 1px solid var(--border-color-primary) !important;
+}
 
 thead + tbody tr:first-child td,
-thead + tbody tr:first-child th { border-top: 1px solid; }
+thead + tbody tr:first-child th {
+    border-top: 1px solid var(--border-color-primary) !important;
+}
 
 /* ------------------------------------------------
    Tools CheckboxGroup - vertical DragDrop-like style

From e8b31c063a3a5d1486dba2969b116835aa6a56bf Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 20:38:31 -0700
Subject: [PATCH 56/76] UI: Soften message action icons in light mode

---
 css/main.css | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/css/main.css b/css/main.css
index d9dc5d2e..a59e08ce 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1428,11 +1428,11 @@ audio {
 }
 
 .footer-button svg {
-    stroke: rgb(107 114 128);
+    stroke: rgb(140 140 148);
 }
 
 .footer-button:hover svg {
-    stroke: rgb(64 64 64);
+    stroke: rgb(107 114 128);
 }
 
 .dark .footer-button svg {

From 1f49a64e1ac1b2e700146956ac3dc17794d53243 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 20:44:37 -0700
Subject: [PATCH 57/76] UI: Improve blockquote border width and color

---
 css/main.css | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/css/main.css b/css/main.css
index a59e08ce..6685ab34 100644
--- a/css/main.css
+++ b/css/main.css
@@ -438,8 +438,9 @@ audio {
     color: #e8e8e8 !important;
 }
 
-.dark .message-body blockquote {
-    border-left-color: rgb(255 255 255 / 30%);
+.message-body blockquote {
+    border-left-width: 4px;
+    border-left-color: var(--border-color-primary);
 }
 
 .message-body h1,

From 91f9b01516ff50bd35477ccccff9b53a03041cf8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 21:13:20 -0700
Subject: [PATCH 58/76] UI: Add top padding to chat input thumbnails

---
 css/main.css | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/css/main.css b/css/main.css
index 6685ab34..7f47a3aa 100644
--- a/css/main.css
+++ b/css/main.css
@@ -642,6 +642,10 @@ audio {
     background: transparent;
 }
 
+#chat-input .thumbnails {
+    padding-top: 3px;
+}
+
 .chat-input-positioned {
     max-width: 54rem;
     left: 50%;

From 9805ddcde95f75bb1de100553dd3b604a4a6537c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 21:34:09 -0700
Subject: [PATCH 59/76] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 17 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index b7a5ca97..9f83830a 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 2c627585..b4b8386e 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 7e3fc35f..41ee6a60 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 2603201d..8be2f55e 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index fe3bf3ba..d7f1bf13 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index acae301e..7b331f96 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 56795843..b467cf26 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index abaa1338..4eca16e1 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index b22a03d9..55f8d3f8 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 97c5903c..54e8f350 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 57e92f74..f073a614 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 1f7d27a7..8cd40f39 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 65f6a004..fbb9125d 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index 0a82adb7..59fcfae1 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 3d812045..ffdbe568 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 91bef10b..4a47b1f0 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 7c61f0cc..97abd933 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio-4.37.2+custom.18-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.18/gradio_client-1.0.2+custom.18-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio-4.37.2+custom.19-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_client-1.0.2+custom.19-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From c63a79ee4871178aa4d7b7f570e5e9d45b0280de Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 23:15:14 -0700
Subject: [PATCH 60/76] Image generation: Embed generation metadata in API
 image responses

---
 modules/api/images.py          | 23 +++++++++++++++++------
 modules/ui_image_generation.py |  3 +++
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/modules/api/images.py b/modules/api/images.py
index 95704535..dde7d336 100644
--- a/modules/api/images.py
+++ b/modules/api/images.py
@@ -4,8 +4,11 @@ OpenAI-compatible image generation using local diffusion models.
 
 import base64
 import io
+import json
 import time
 
+from PIL.PngImagePlugin import PngInfo
+
 from .errors import ServiceUnavailableError
 from modules import shared
 
@@ -15,7 +18,7 @@ def generations(request):
     Generate images using the loaded diffusion model.
     Returns dict with 'created' timestamp and 'data' list of images.
     """
-    from modules.ui_image_generation import generate
+    from modules.ui_image_generation import build_generation_metadata, generate
 
     if shared.image_model is None:
         raise ServiceUnavailableError("No image model loaded. Load a model via the UI first.")
@@ -46,10 +49,18 @@ def generations(request):
     if not images:
         raise ServiceUnavailableError("Image generation failed or produced no images.")
 
-    # Build response
+    # Build response with per-batch metadata (seed increments per batch)
+    base_seed = state.get('image_seed_resolved', state['image_seed'])
+    batch_size = int(state['image_batch_size'])
+
     resp = {'created': int(time.time()), 'data': []}
-    for img in images:
-        b64 = _image_to_base64(img)
+    for idx, img in enumerate(images):
+        batch_seed = base_seed + idx // batch_size
+        metadata = build_generation_metadata(state, batch_seed)
+        metadata_json = json.dumps(metadata, ensure_ascii=False)
+        png_info = PngInfo()
+        png_info.add_text("image_gen_settings", metadata_json)
+        b64 = _image_to_base64(img, png_info)
 
         image_obj = {'revised_prompt': request.prompt}
 
@@ -63,7 +74,7 @@ def generations(request):
     return resp
 
 
-def _image_to_base64(image) -> str:
+def _image_to_base64(image, png_info=None) -> str:
     buffered = io.BytesIO()
-    image.save(buffered, format="PNG")
+    image.save(buffered, format="PNG", pnginfo=png_info)
     return base64.b64encode(buffered.getvalue()).decode('utf-8')
diff --git a/modules/ui_image_generation.py b/modules/ui_image_generation.py
index 1efb2479..727aa7b1 100644
--- a/modules/ui_image_generation.py
+++ b/modules/ui_image_generation.py
@@ -798,6 +798,9 @@ def generate(state, save_images=True):
         if seed == -1:
             seed = random.randint(0, 2**32 - 1)
 
+        # Store resolved seed back so callers (e.g. API) can access it
+        state['image_seed_resolved'] = seed
+
         device = get_device()
         if device is None:
             device = "cpu"

From 544fcb0b7f0344fac249005f869b02110da69738 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 23:29:57 -0700
Subject: [PATCH 61/76] Simplify modules/image_models.py

---
 modules/image_models.py | 69 ++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 46 deletions(-)

diff --git a/modules/image_models.py b/modules/image_models.py
index 290aaf19..eed8783c 100644
--- a/modules/image_models.py
+++ b/modules/image_models.py
@@ -10,72 +10,49 @@ def get_quantization_config(quant_method):
     Get the appropriate quantization config based on the selected method.
     Applies quantization to both the transformer and the text_encoder.
     """
+    if quant_method == 'none' or not quant_method:
+        return None
+
     import torch
-    # Import BitsAndBytesConfig from BOTH libraries to be safe
     from diffusers import BitsAndBytesConfig as DiffusersBnBConfig
     from diffusers import TorchAoConfig
     from diffusers.quantizers import PipelineQuantizationConfig
     from transformers import BitsAndBytesConfig as TransformersBnBConfig
 
-    if quant_method == 'none' or not quant_method:
-        return None
+    torchao_methods = {
+        'torchao-int8wo': 'int8wo',
+        'torchao-fp4': 'fp4_e2m1',
+        'torchao-float8wo': 'float8wo',
+    }
 
-    # Bitsandbytes 8-bit quantization
-    elif quant_method == 'bnb-8bit':
+    if quant_method == 'bnb-8bit':
         return PipelineQuantizationConfig(
             quant_mapping={
-                "transformer": DiffusersBnBConfig(
-                    load_in_8bit=True
-                ),
-                "text_encoder": TransformersBnBConfig(
-                    load_in_8bit=True
-                )
+                "transformer": DiffusersBnBConfig(load_in_8bit=True),
+                "text_encoder": TransformersBnBConfig(load_in_8bit=True)
             }
         )
 
-    # Bitsandbytes 4-bit quantization
     elif quant_method == 'bnb-4bit':
+        bnb_4bit_kwargs = dict(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=True
+        )
         return PipelineQuantizationConfig(
             quant_mapping={
-                "transformer": DiffusersBnBConfig(
-                    load_in_4bit=True,
-                    bnb_4bit_quant_type="nf4",
-                    bnb_4bit_compute_dtype=torch.bfloat16,
-                    bnb_4bit_use_double_quant=True
-                ),
-                "text_encoder": TransformersBnBConfig(
-                    load_in_4bit=True,
-                    bnb_4bit_quant_type="nf4",
-                    bnb_4bit_compute_dtype=torch.bfloat16,
-                    bnb_4bit_use_double_quant=True
-                )
+                "transformer": DiffusersBnBConfig(**bnb_4bit_kwargs),
+                "text_encoder": TransformersBnBConfig(**bnb_4bit_kwargs)
             }
         )
 
-    # torchao int8 weight-only
-    elif quant_method == 'torchao-int8wo':
+    elif quant_method in torchao_methods:
+        ao_type = torchao_methods[quant_method]
         return PipelineQuantizationConfig(
             quant_mapping={
-                "transformer": TorchAoConfig("int8wo"),
-                "text_encoder": TorchAoConfig("int8wo")
-            }
-        )
-
-    # torchao fp4 (e2m1)
-    elif quant_method == 'torchao-fp4':
-        return PipelineQuantizationConfig(
-            quant_mapping={
-                "transformer": TorchAoConfig("fp4_e2m1"),
-                "text_encoder": TorchAoConfig("fp4_e2m1")
-            }
-        )
-
-    # torchao float8 weight-only
-    elif quant_method == 'torchao-float8wo':
-        return PipelineQuantizationConfig(
-            quant_mapping={
-                "transformer": TorchAoConfig("float8wo"),
-                "text_encoder": TorchAoConfig("float8wo")
+                "transformer": TorchAoConfig(ao_type),
+                "text_encoder": TorchAoConfig(ao_type)
             }
         )
 

From 422f42ca7faa1d0834b1b503e87d605ad55f1ef8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 4 Apr 2026 23:51:15 -0700
Subject: [PATCH 62/76] Pre-compile LaTeX regex in html_generator.py

---
 modules/html_generator.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 8dd46850..e3ebea8d 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -14,6 +14,13 @@ from modules.reasoning import extract_reasoning
 from modules.sane_markdown_lists import SaneListExtension
 from modules.utils import get_available_chat_styles
 
+# Pre-compiled regex for protecting markdown-sensitive characters inside LaTeX.
+# Covers $$...$$, \[...\], \(...\), and inline $...$ (when content contains \\).
+_LATEX_PATTERN = re.compile(
+    r'((?:^|[\r\n\s])\$\$[^`]*?\$\$)|\\\[(.*?)\\\]|\\\((.*?)\\\)|(?<!\$)\$(?!\$)([^\$\n]*\\\\[^\$\n]*?)\$(?!\$)',
+    re.DOTALL
+)
+
 # This is to store the paths to the thumbnails of the profile pictures
 image_cache = {}
 
@@ -241,9 +248,7 @@ def process_markdown_content(string):
     string = re.sub(r"(.)```", r"\1\n```", string)
 
     # Protect asterisks and underscores within all LaTeX blocks before markdown conversion
-    latex_pattern = re.compile(r'((?:^|[\r\n\s])\$\$[^`]*?\$\$)|\\\[(.*?)\\\]|\\\((.*?)\\\)|(?<!\$)\$(?!\$)([^\$\n]*\\\\[^\$\n]*?)\$(?!\$)',
-                               re.DOTALL)
-    string = latex_pattern.sub(protect_asterisks_underscores_in_latex, string)
+    string = _LATEX_PATTERN.sub(protect_asterisks_underscores_in_latex, string)
 
     result = ''
     is_code = False

From d78fc46114a4ce1de505fc286798372ddaa0c32d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 5 Apr 2026 05:55:39 -0700
Subject: [PATCH 63/76] Fix "address already in use" on server restart
 (Linux/macOS)

---
 modules/api/script.py       | 26 ++++++++++++++++++++++++--
 modules/llama_cpp_server.py |  1 +
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/modules/api/script.py b/modules/api/script.py
index beed3d06..14e2d03a 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -591,9 +591,31 @@ def run_server():
     if shared.args.admin_key and shared.args.admin_key != shared.args.api_key:
         logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n')
 
-    # Start server
+    # Use SO_REUSEADDR to avoid "address already in use" after restart
     logging.getLogger("uvicorn.error").propagate = False
-    uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
+    sockets = []
+    try:
+        for addr in server_addrs:
+            family = socket.AF_INET6 if ':' in addr else socket.AF_INET
+            sock = socket.socket(family, socket.SOCK_STREAM)
+            sockets.append(sock)  # Register first so a failed setsockopt()/bind()/listen() is still closed by the cleanup below
+            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            if family == socket.AF_INET6:
+                sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, 1)
+            sock.bind((addr.strip('[]'), port))
+            sock.listen(socket.SOMAXCONN)
+    except Exception:
+        for s in sockets:
+            s.close()
+        raise
+
+    config = uvicorn.Config(app, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
+    server = uvicorn.Server(config)
+    try:
+        server.run(sockets=sockets)
+    finally:
+        for s in sockets:
+            s.close()
 
 
 _server_started = False
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 34080466..c01f5d5b 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -373,6 +373,7 @@ class LlamaServer:
         """Check if a port is available for use."""
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
             try:
+                s.setsockopt(socket.SOL_SOCKET, getattr(socket, 'SO_EXCLUSIVEADDRUSE', socket.SO_REUSEADDR), 1)  # Windows: SO_REUSEADDR would let bind() succeed on a busy port, so request exclusive use there
                 s.bind(('', port))
                 return True
             except OSError:

From f8db23b36286b09155e08beaa07a5797c879c7ef Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 5 Apr 2026 17:12:28 -0700
Subject: [PATCH 64/76] Call ik portable build folders
 text-generation-webui-ik-version

---
 .github/workflows/build-portable-release-ik-cuda.yml | 12 ++++++------
 .github/workflows/build-portable-release-ik.yml      | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml
index 331a7653..a336a1cb 100644
--- a/.github/workflows/build-portable-release-ik-cuda.yml
+++ b/.github/workflows/build-portable-release-ik-cuda.yml
@@ -102,8 +102,8 @@ jobs:
             VERSION_CLEAN="${{ inputs.version }}"
             VERSION_CLEAN="${VERSION_CLEAN#v}"
             cd ..
-            cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
-            cd "text-generation-webui-${VERSION_CLEAN}"
+            cp -r text-generation-webui "text-generation-webui-ik-${VERSION_CLEAN}"
+            cd "text-generation-webui-ik-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
             allowed=("character_bias" "gallery" "sd_api_pictures")
@@ -133,10 +133,10 @@ jobs:
             echo "Downloading Python for $PLATFORM..."
             curl -L -o python-build.tar.gz "$PYTHON_URL"
             tar -xzf python-build.tar.gz
-            mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+            mv python "text-generation-webui-ik-${VERSION_CLEAN}/portable_env"
 
             # 3. Prepare requirements file based on CUDA version
-            cd "text-generation-webui-${VERSION_CLEAN}"
+            cd "text-generation-webui-ik-${VERSION_CLEAN}"
             if [[ "$CUDA_VERSION" == "13.1" ]]; then
                 REQ_FILE="requirements/portable/requirements_ik_cuda131.txt"
             else
@@ -158,11 +158,11 @@ jobs:
             if [[ "$RUNNER_OS" == "Windows" ]]; then
                 ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
                 echo "Creating archive: $ARCHIVE_NAME"
-                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-ik-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
             else
                 ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz"
                 echo "Creating archive: $ARCHIVE_NAME"
-                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-ik-${VERSION_CLEAN}"
             fi
 
       - name: Upload files to a GitHub release
diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml
index bf54eb0e..5eaf7c86 100644
--- a/.github/workflows/build-portable-release-ik.yml
+++ b/.github/workflows/build-portable-release-ik.yml
@@ -101,8 +101,8 @@ jobs:
             VERSION_CLEAN="${{ inputs.version }}"
             VERSION_CLEAN="${VERSION_CLEAN#v}"
             cd ..
-            cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
-            cd "text-generation-webui-${VERSION_CLEAN}"
+            cp -r text-generation-webui "text-generation-webui-ik-${VERSION_CLEAN}"
+            cd "text-generation-webui-ik-${VERSION_CLEAN}"
 
             # Remove extensions that need additional requirements
             allowed=("character_bias" "gallery" "sd_api_pictures")
@@ -131,10 +131,10 @@ jobs:
             cd ..
             curl -L -o python-build.tar.gz "$PYTHON_URL"
             tar -xzf python-build.tar.gz
-            mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+            mv python "text-generation-webui-ik-${VERSION_CLEAN}/portable_env"
 
             # 3. Prepare requirements file
-            cd "text-generation-webui-${VERSION_CLEAN}"
+            cd "text-generation-webui-ik-${VERSION_CLEAN}"
             REQ_FILE="requirements/portable/requirements_ik_cpu_only.txt"
             echo "Using requirements file: $REQ_FILE"
 
@@ -153,11 +153,11 @@ jobs:
             if [[ "$RUNNER_OS" == "Windows" ]]; then
                 ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip"
                 echo "Creating archive: $ARCHIVE_NAME"
-                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-ik-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
             else
                 ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz"
                 echo "Creating archive: $ARCHIVE_NAME"
-                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-ik-${VERSION_CLEAN}"
             fi
 
       - name: Upload files to a GitHub release

From 223dd4b8017d24f7c5c2f33be2ca8409e1897b34 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 5 Apr 2026 18:22:50 -0700
Subject: [PATCH 65/76] UI: Hide spin buttons on number inputs

---
 css/main.css | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/css/main.css b/css/main.css
index 7f47a3aa..db0b781b 100644
--- a/css/main.css
+++ b/css/main.css
@@ -22,6 +22,17 @@
     font-style: italic;
 }
 
+/* Hide the native spin buttons on number inputs (they look bad on Windows): pseudo-elements for WebKit/Blink, -moz-appearance for Firefox */
+input[type="number"]::-webkit-outer-spin-button,
+input[type="number"]::-webkit-inner-spin-button {
+    -webkit-appearance: none;
+    margin: 0;
+}
+
+input[type="number"] {
+    -moz-appearance: textfield;
+}
+
 .padded.svelte-12cmxck {
     padding: 3px 0;
 }

From abc3487f4dec9215abd9ebfb5ac796c32361b018 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 5 Apr 2026 18:24:26 -0700
Subject: [PATCH 66/76] UI: Move cpu-moe checkbox to extra flags (no longer
 useful now that --fit exists)

---
 modules/ui_model_menu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 243079a0..9c8306f5 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -54,7 +54,6 @@ def create_ui():
                             if not shared.args.portable:
                                 shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.')
 
-                            shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
                             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
@@ -109,6 +108,7 @@ def create_ui():
                             with gr.Column():
                                 shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
                                 shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
+                                shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
                                 shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
                                 shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance.')
                                 shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)

From b1d06dcf96e2b5958ae004b8c9bbb0fc8518328b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 5 Apr 2026 23:07:14 -0300
Subject: [PATCH 67/76] UI: Add MCP server support

---
 README.md                                     |   2 +-
 docs/Tool Calling Tutorial.md                 |  13 ++
 modules/chat.py                               |  15 ++-
 modules/shared.py                             |   1 +
 modules/tool_use.py                           | 114 ++++++++++++++++++
 modules/ui.py                                 |   2 +
 modules/ui_chat.py                            |   3 +
 requirements/full/requirements.txt            |   1 +
 requirements/full/requirements_amd.txt        |   1 +
 .../full/requirements_apple_intel.txt         |   1 +
 .../full/requirements_apple_silicon.txt       |   1 +
 requirements/full/requirements_cpu_only.txt   |   1 +
 requirements/full/requirements_nowheels.txt   |   1 +
 requirements/portable/requirements.txt        |   1 +
 requirements/portable/requirements_amd.txt    |   1 +
 .../portable/requirements_apple_intel.txt     |   1 +
 .../portable/requirements_apple_silicon.txt   |   1 +
 .../portable/requirements_cpu_only.txt        |   1 +
 .../portable/requirements_cuda131.txt         |   1 +
 requirements/portable/requirements_ik.txt     |   1 +
 .../portable/requirements_ik_cpu_only.txt     |   1 +
 .../portable/requirements_ik_cuda131.txt      |   1 +
 .../portable/requirements_nowheels.txt        |   1 +
 requirements/portable/requirements_vulkan.txt |   1 +
 24 files changed, 163 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 23cd09c5..b168ebdb 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ A Gradio web UI for running Large Language Models locally. 100% private and offl
 - **Easy setup**: [Portable builds](https://github.com/oobabooga/text-generation-webui/releases) (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or a one-click installer for the full feature set.
 - **Multiple backends**: [llama.cpp](https://github.com/ggerganov/llama.cpp), [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). Switch between backends and models without restarting.
 - **OpenAI/Anthropic-compatible API**: Chat, Completions, and Messages endpoints with tool-calling support. Use as a local drop-in replacement for the OpenAI/Anthropic APIs ([examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples)).
-- **Tool-calling**: Models can call custom functions during chat — web search, page fetching, math, and more. Each tool is a single `.py` file, easy to create and extend ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Tool-Calling-Tutorial)).
+- **Tool-calling**: Models can call custom functions during chat — web search, page fetching, math, and more. Each tool is a single `.py` file. MCP servers are also supported ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Tool-Calling-Tutorial)).
 - **Vision (multimodal)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
 - **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
 - **Training**: Fine-tune LoRAs on multi-turn chat or raw text datasets. Supports resuming interrupted runs ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)).
diff --git a/docs/Tool Calling Tutorial.md b/docs/Tool Calling Tutorial.md
index d95a9c80..7d2a86de 100644
--- a/docs/Tool Calling Tutorial.md	
+++ b/docs/Tool Calling Tutorial.md	
@@ -80,6 +80,19 @@ def execute(arguments):
 
 You can open the built-in tools in `user_data/tools/` for more examples.
 
+## MCP servers
+
+You can connect to remote [MCP (Model Context Protocol)](https://modelcontextprotocol.io/) servers to use their tools alongside local ones.
+
+In the chat sidebar, open the **MCP servers** accordion and enter one server URL per line. For servers that require authentication, append `Header: value` pairs after the URL, separated by commas (so neither the URL nor a header value may itself contain a comma):
+
+```
+https://example.com/mcp
+https://other.com/mcp,Authorization: Bearer sk-xxx
+```
+
+All tools from the configured servers are automatically discovered and made available to the model during generation. If an MCP tool has the same name as a selected local tool, the local tool takes priority.
+
 ## Tool calling over the API
 
 Tool calling over the API follows the [OpenAI API](https://platform.openai.com/docs/guides/function-calling) convention. Define your tools, send them with your messages, and handle tool calls in a loop until the model gives a final answer.
diff --git a/modules/chat.py b/modules/chat.py
index 76b8694a..aeed688d 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1264,14 +1264,23 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
 
     # Load tools if any are selected
     selected = state.get('selected_tools', [])
+    mcp_servers = state.get('mcp_servers', '')
     parse_tool_call = None
     _tool_parsers = None
-    if selected:
-        from modules.tool_use import load_tools, execute_tool
+    if selected or mcp_servers:
+        from modules.tool_use import load_tools, load_mcp_tools, execute_tool
         from modules.tool_parsing import parse_tool_call, get_tool_call_id, detect_tool_call_format
 
-    if selected:
         tool_defs, tool_executors = load_tools(selected)
+        if mcp_servers:
+            mcp_defs, mcp_executors = load_mcp_tools(mcp_servers)
+            for td in mcp_defs:
+                fn = td['function']['name']
+                if fn in tool_executors:
+                    logger.warning(f'MCP tool "{fn}" conflicts with a local tool. Skipping.')
+                    continue
+                tool_defs.append(td)
+                tool_executors[fn] = mcp_executors[fn]
         state['tools'] = tool_defs
         tool_func_names = [t['function']['name'] for t in tool_defs]
         _template_str = state.get('instruction_template_str', '') if state.get('mode') == 'instruct' else state.get('chat_template_str', '')
diff --git a/modules/shared.py b/modules/shared.py
index 13843f0c..92c4f56c 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -259,6 +259,7 @@ settings = {
     'enable_web_search': False,
     'web_search_pages': 3,
     'selected_tools': [],
+    'mcp_servers': '',
     'prompt-notebook': '',
     'preset': 'Top-P' if (user_data_dir / 'presets/Top-P.yaml').exists() else None,
     'max_new_tokens': 512,
diff --git a/modules/tool_use.py b/modules/tool_use.py
index e22b1798..f9ddf940 100644
--- a/modules/tool_use.py
+++ b/modules/tool_use.py
@@ -1,3 +1,4 @@
+import asyncio
 import importlib.util
 import json
 
@@ -55,6 +56,119 @@ def load_tools(selected_names):
     return tool_defs, executors
 
 
+def _parse_mcp_servers(servers_str):
+    """Parse the MCP servers textbox into a list of (url, headers) tuples. One server per line: 'url' or 'url,Header: value,Header2: value2'."""
+    servers = []
+    for line in servers_str.strip().splitlines():
+        line = line.strip()
+        if not line:
+            continue  # skip blank lines
+        parts = line.split(',')  # NOTE: every comma is a separator, so neither the URL nor a header value may contain one
+        url = parts[0].strip()
+        headers = {}
+        for part in parts[1:]:
+            part = part.strip()
+            if ':' in part:  # segments without a colon are silently ignored
+                key, val = part.split(':', 1)  # split on the first colon only, so values may contain ':'
+                headers[key.strip()] = val.strip()
+        servers.append((url, headers))
+    return servers
+
+
+def _mcp_tool_to_openai(tool):
+    """Convert an MCP Tool object (read via .name, .description, .inputSchema) to an OpenAI-format function tool dict."""
+    return {
+        "type": "function",
+        "function": {
+            "name": tool.name,
+            "description": tool.description or "",  # a falsy description (e.g. None) becomes ""
+            "parameters": tool.inputSchema or {"type": "object", "properties": {}}  # fall back to an empty object schema
+        }
+    }
+
+
+async def _mcp_session(url, headers, callback):
+    """Open a streamable-HTTP MCP session to `url`, initialize it, and return the result of awaiting `callback(session)`."""
+    from mcp.client.streamable_http import streamablehttp_client  # NOTE(review): imported locally, presumably so `mcp` is only needed when MCP is used
+    from mcp import ClientSession
+
+    async with streamablehttp_client(url, headers=headers or None) as (read_stream, write_stream, _):  # an empty headers dict is passed as None
+        async with ClientSession(read_stream, write_stream) as session:
+            await session.initialize()
+            return await callback(session)
+
+
+def _make_mcp_executor(name, url, headers):  # returns a sync callable(arguments) that invokes MCP tool `name` on the server at `url`
+    def executor(arguments):
+        return asyncio.run(_call_mcp_tool(name, arguments, url, headers))  # each call runs its own event loop and opens a fresh session
+    return executor
+
+
+async def _connect_mcp_server(url, headers):
+    """Connect to one MCP server, list its tools, and return (tool_defs, executors)."""
+
+    async def _discover(session):
+        result = await session.list_tools()
+        tool_defs = []  # OpenAI-format tool dicts, in the order the server listed them
+        executors = {}  # tool name -> sync callable that invokes that tool
+        for tool in result.tools:
+            tool_defs.append(_mcp_tool_to_openai(tool))
+            executors[tool.name] = _make_mcp_executor(tool.name, url, headers)
+        return tool_defs, executors
+
+    return await _mcp_session(url, headers, _discover)
+
+
+async def _call_mcp_tool(name, arguments, url, headers):
+    """Open a new session to the MCP server at `url`, call tool `name` with `arguments`, and return the result as one string."""
+
+    async def _invoke(session):
+        result = await session.call_tool(name, arguments)
+        parts = []
+        for content in result.content:
+            if hasattr(content, 'text'):
+                parts.append(content.text)
+            else:
+                parts.append(str(content))  # content items without a .text attribute are stringified
+        return '\n'.join(parts) if parts else ''  # one line per content item; '' when there is none
+
+    return await _mcp_session(url, headers, _invoke)
+
+
+async def _connect_all_mcp_servers(servers):
+    """Connect to all MCP servers concurrently and merge their tools into (tool_defs, executors); failed servers are logged and skipped."""
+    results = await asyncio.gather(
+        *(_connect_mcp_server(url, headers) for url, headers in servers),
+        return_exceptions=True
+    )
+    all_defs, all_executors = [], {}
+    for (url, _), result in zip(servers, results):
+        if isinstance(result, BaseException):  # gather can also hand back CancelledError / BaseExceptionGroup, which are not Exception subclasses
+            logger.exception(f'Failed to connect to MCP server "{url}"', exc_info=result)
+            continue
+        defs, execs = result
+        for td in defs:  # key on each definition's own name; zipping defs with execs.items() misaligns if a server lists a name twice
+            fn = td['function']['name']
+            if fn in all_executors:
+                logger.warning(f'MCP tool "{fn}" from {url} conflicts with an already loaded tool. Skipping.')
+                continue
+            all_defs.append(td)
+            all_executors[fn] = execs[fn]
+    return all_defs, all_executors
+
+
+def load_mcp_tools(servers_str):
+    """
+    Parse the MCP servers string and discover the tools of every listed server.
+    Returns (tool_defs, executors) in the same format as load_tools; ([], {}) if no server is listed.
+    """
+    servers = _parse_mcp_servers(servers_str)
+    if not servers:
+        return [], {}
+
+    return asyncio.run(_connect_all_mcp_servers(servers))  # blocks until discovery finishes; asyncio.run needs a thread with no running event loop
+
+
 def execute_tool(func_name, arguments, executors):
     """Execute a tool by function name. Returns result as a JSON string."""
     fn = executors.get(func_name)
diff --git a/modules/ui.py b/modules/ui.py
index 73072cbe..3a8390f7 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -209,6 +209,7 @@ def list_interface_input_elements():
         'textbox',
         'start_with',
         'selected_tools',
+        'mcp_servers',
         'mode',
         'chat_style',
         'chat-instruct_command',
@@ -434,6 +435,7 @@ def setup_auto_save():
         'custom_system_message',
         'chat_template_str',
         'selected_tools',
+        'mcp_servers',
 
         # Parameters tab (ui_parameters.py) - Generation parameters
         'preset_menu',
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index d9652253..14489d96 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -105,6 +105,9 @@ def create_ui():
 
                 shared.gradio['selected_tools'].change(fn=sync_web_tools, inputs=[shared.gradio['selected_tools']], outputs=[shared.gradio['selected_tools']], show_progress=False)
 
+                with gr.Accordion('MCP servers', open=False):
+                    shared.gradio['mcp_servers'] = gr.Textbox(value=shared.settings.get('mcp_servers', ''), lines=3, max_lines=3, label='', info='One url per line. For headers, write url,Header: value,Header2: value2', elem_classes=['add_scrollbar'])
+
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 with gr.Row():
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 9f83830a..104cfdb2 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -48,3 +48,4 @@ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
+mcp==1.27.0
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index b4b8386e..49db44db 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -39,3 +39,4 @@ tiktoken
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+mcp==1.27.0
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 41ee6a60..4584708f 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -38,3 +38,4 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+mcp==1.27.0
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 8be2f55e..4376a2b4 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -38,3 +38,4 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+mcp==1.27.0
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index d7f1bf13..2999d4a9 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -41,3 +41,4 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+mcp==1.27.0
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 7b331f96..5a1e504e 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -35,3 +35,4 @@ https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_cl
 flask_cloudflared==0.0.15
 sse-starlette==1.6.5
 tiktoken
+mcp==1.27.0
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index b467cf26..fb51c7cc 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -25,3 +25,4 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 4eca16e1..dbea7597 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -25,3 +25,4 @@ tiktoken
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 55f8d3f8..d0f83a74 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -24,3 +24,4 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 54e8f350..160c0646 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -24,3 +24,4 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index f073a614..21695585 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -25,3 +25,4 @@ tiktoken
 # llama.cpp (CPU only)
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 8cd40f39..6b09a46b 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -25,3 +25,4 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index fbb9125d..ca5ece2d 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -25,3 +25,4 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index 59fcfae1..f8bafb27 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -25,3 +25,4 @@ tiktoken
 # ik_llama.cpp (CPU only)
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index ffdbe568..7825b959 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -25,3 +25,4 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+mcp==1.27.0
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 4a47b1f0..cde036d9 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -21,3 +21,4 @@ https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_cl
 flask_cloudflared==0.0.15
 sse-starlette==1.6.5
 tiktoken
+mcp==1.27.0
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 97abd933..32f9e593 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -25,3 +25,4 @@ tiktoken
 # Vulkan wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+mcp==1.27.0

From 05e484203308adb3324f7a9edd1412ed9762e359 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 5 Apr 2026 20:03:06 -0700
Subject: [PATCH 68/76] Fix image generation: default to SDPA attention backend

---
 modules/image_models.py | 2 +-
 modules/shared.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/image_models.py b/modules/image_models.py
index eed8783c..e244c3c8 100644
--- a/modules/image_models.py
+++ b/modules/image_models.py
@@ -129,7 +129,7 @@ def load_image_model(model_name, dtype='bfloat16', attn_backend='sdpa', cpu_offl
 
         modules = ["transformer", "unet"]
 
-        # Set attention backend
+        # Set attention backend (diffusers defaults to native/SDPA)
         if attn_backend == 'flash_attention_2':
             for name in modules:
                 mod = getattr(pipe, name, None)
diff --git a/modules/shared.py b/modules/shared.py
index 92c4f56c..e04f28f3 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -364,7 +364,7 @@ settings = {
     'image_llm_variations_prompt': 'Write a variation of the image generation prompt above. Consider the intent of the user with that prompt and write something that will likely please them, with added details. Output only the new prompt. Do not add any explanations, prefixes, or additional text.',
     'image_model_menu': 'None',
     'image_dtype': 'bfloat16',
-    'image_attn_backend': 'flash_attention_2',
+    'image_attn_backend': 'sdpa',
     'image_cpu_offload': False,
     'image_compile': False,
     'image_quant': 'none',

From 7b2f15e34ae57a6e86b0901482b4ed9b6b52ad8a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 5 Apr 2026 21:16:32 -0700
Subject: [PATCH 69/76] Minor change after
 b1d06dcf96e2b5958ae004b8c9bbb0fc8518328b

---
 requirements/full/requirements.txt                   | 2 +-
 requirements/full/requirements_amd.txt               | 2 +-
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 2 +-
 requirements/full/requirements_nowheels.txt          | 2 +-
 requirements/portable/requirements.txt               | 2 +-
 requirements/portable/requirements_amd.txt           | 2 +-
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 2 +-
 requirements/portable/requirements_cuda131.txt       | 2 +-
 requirements/portable/requirements_ik.txt            | 2 +-
 requirements/portable/requirements_ik_cpu_only.txt   | 2 +-
 requirements/portable/requirements_ik_cuda131.txt    | 2 +-
 requirements/portable/requirements_nowheels.txt      | 2 +-
 requirements/portable/requirements_vulkan.txt        | 2 +-
 17 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 104cfdb2..d466e7e3 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -9,6 +9,7 @@ flash-linear-attention==0.4.*
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pandas
 peft==0.18.*
@@ -48,4 +49,3 @@ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
-mcp==1.27.0
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 49db44db..e88ff7c5 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -7,6 +7,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pandas
 peft==0.18.*
@@ -39,4 +40,3 @@ tiktoken
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-mcp==1.27.0
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 4584708f..eefd979e 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -7,6 +7,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pandas
 peft==0.18.*
@@ -38,4 +39,3 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
-mcp==1.27.0
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 4376a2b4..d1b4e09f 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -7,6 +7,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pandas
 peft==0.18.*
@@ -38,4 +39,3 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
-mcp==1.27.0
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 2999d4a9..156ceb77 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -7,6 +7,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pandas
 peft==0.18.*
@@ -41,4 +42,3 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-mcp==1.27.0
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 5a1e504e..19ac5183 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -7,6 +7,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pandas
 peft==0.18.*
@@ -35,4 +36,3 @@ https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_cl
 flask_cloudflared==0.0.15
 sse-starlette==1.6.5
 tiktoken
-mcp==1.27.0
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index fb51c7cc..8a158f05 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -25,4 +26,3 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index dbea7597..a4949a46 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -25,4 +26,3 @@ tiktoken
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index d0f83a74..227823a6 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -24,4 +25,3 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 160c0646..9779dd4a 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -24,4 +25,3 @@ tiktoken
 
 # Mac wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 21695585..ff84907a 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -25,4 +26,3 @@ tiktoken
 # llama.cpp (CPU only)
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 6b09a46b..89e43e1a 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -25,4 +26,3 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index ca5ece2d..a23d8ff0 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -25,4 +26,3 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index f8bafb27..a200e80f 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -25,4 +26,3 @@ tiktoken
 # ik_llama.cpp (CPU only)
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 7825b959..8e9a097b 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -25,4 +26,3 @@ tiktoken
 # CUDA wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-mcp==1.27.0
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index cde036d9..cafe3cee 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -21,4 +22,3 @@ https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.19/gradio_cl
 flask_cloudflared==0.0.15
 sse-starlette==1.6.5
 tiktoken
-mcp==1.27.0
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 32f9e593..59524668 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -3,6 +3,7 @@ fastapi==0.112.4
 huggingface-hub==1.5.*
 jinja2==3.1.6
 markdown
+mcp==1.27.0
 numpy==2.2.*
 pydantic==2.11.0
 pymupdf==1.27.*
@@ -25,4 +26,3 @@ tiktoken
 # Vulkan wheels
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
 https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-mcp==1.27.0

From 4d6230a944a71dab794d880d7c353eb37934d584 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 6 Apr 2026 06:48:48 -0700
Subject: [PATCH 70/76] Follow-up to d78fc46114a4ce1de505fc286798372ddaa0c32d

---
 modules/api/script.py | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

diff --git a/modules/api/script.py b/modules/api/script.py
index 14e2d03a..e79a1967 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -532,8 +532,8 @@ async def handle_unload_loras():
 def find_available_port(starting_port):
     """Try the starting port, then find an available one if it's taken."""
     try:
-        # Try to create a socket with the starting port
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
             s.bind(('', starting_port))
             return starting_port
     except OSError:
@@ -591,31 +591,9 @@ def run_server():
     if shared.args.admin_key and shared.args.admin_key != shared.args.api_key:
         logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n')
 
-    # Use SO_REUSEADDR to avoid "address already in use" after restart
+    # Start server
     logging.getLogger("uvicorn.error").propagate = False
-    sockets = []
-    try:
-        for addr in server_addrs:
-            family = socket.AF_INET6 if ':' in addr else socket.AF_INET
-            sock = socket.socket(family, socket.SOCK_STREAM)
-            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-            if family == socket.AF_INET6:
-                sock.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, 1)
-            sock.bind((addr.strip('[]'), port))
-            sock.listen(socket.SOMAXCONN)
-            sockets.append(sock)
-    except Exception:
-        for s in sockets:
-            s.close()
-        raise
-
-    config = uvicorn.Config(app, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
-    server = uvicorn.Server(config)
-    try:
-        server.run(sockets=sockets)
-    finally:
-        for s in sockets:
-            s.close()
+    uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
 
 
 _server_started = False

From c26ffdd24c60b1dc6ad339c847b8993f490dc036 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 6 Apr 2026 07:02:53 -0700
Subject: [PATCH 71/76] API: add instruction_template support to the model load
 endpoint

---
 docs/12 - OpenAI API.md    | 11 +++++++++++
 modules/api/models.py      | 10 +++++++++-
 modules/api/script.py      |  5 +++++
 modules/api/typing.py      |  2 ++
 modules/models_settings.py | 17 +++++++++++------
 5 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 0a076c35..727f6ece 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -232,6 +232,17 @@ curl -k http://127.0.0.1:5000/v1/internal/model/load \
   }'
 ```
 
+You can also set a default instruction template for all subsequent API requests by passing `instruction_template` (a template name from `user_data/instruction-templates/`) or `instruction_template_str` (a raw Jinja2 string):
+
+```shell
+curl -k http://127.0.0.1:5000/v1/internal/model/load \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model_name": "Qwen_Qwen3-0.6B-Q4_K_M.gguf",
+    "instruction_template": "Alpaca"
+  }'
+```
+
 #### Python chat example
 
 ```python
diff --git a/modules/api/models.py b/modules/api/models.py
index 5dd77850..bfcd2c31 100644
--- a/modules/api/models.py
+++ b/modules/api/models.py
@@ -1,7 +1,8 @@
 from modules import loaders, shared
+from modules.logging_colors import logger
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model
-from modules.models_settings import get_model_metadata, update_model_parameters
+from modules.models_settings import get_model_metadata, load_instruction_template, update_model_parameters
 from modules.utils import get_available_loras, get_available_models
 
 
@@ -69,6 +70,13 @@ def _load_model(data):
 
     shared.model, shared.tokenizer = load_model(model_name)
 
+    if data.get("instruction_template_str") is not None:
+        shared.settings['instruction_template_str'] = data["instruction_template_str"]
+        logger.info("INSTRUCTION TEMPLATE: set to custom Jinja2 string")
+    elif data.get("instruction_template") is not None:
+        shared.settings['instruction_template_str'] = load_instruction_template(data["instruction_template"])
+        logger.info(f"INSTRUCTION TEMPLATE: {data['instruction_template']}")
+
 
 def list_loras():
     return {'lora_names': get_available_loras()[1:]}
diff --git a/modules/api/script.py b/modules/api/script.py
index e79a1967..1f41d0cd 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -487,6 +487,11 @@ async def handle_load_model(request_data: LoadModelRequest):
 
     Loader args are reset to their startup defaults between loads, so
     settings from a previous load do not leak into the next one.
+
+    The "instruction_template" parameter sets the default instruction
+    template by name (from user_data/instruction-templates/). The
+    "instruction_template_str" parameter sets it as a raw Jinja2 string
+    and takes precedence over "instruction_template".
     '''
 
     try:
diff --git a/modules/api/typing.py b/modules/api/typing.py
index a758743e..56d7f2bc 100644
--- a/modules/api/typing.py
+++ b/modules/api/typing.py
@@ -271,6 +271,8 @@ class ModelListResponse(BaseModel):
 class LoadModelRequest(BaseModel):
     model_name: str
     args: dict | None = None
+    instruction_template: str | None = Field(default=None, description="An instruction template defined under text-generation-webui/user_data/instruction-templates. Sets the default template for all subsequent API requests.")
+    instruction_template_str: str | None = Field(default=None, description="A Jinja2 instruction template string. If set, takes precedence over instruction_template.")
 
 
 class LoraListResponse(BaseModel):
diff --git a/modules/models_settings.py b/modules/models_settings.py
index eafa0581..b10d780c 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -400,14 +400,19 @@ def load_instruction_template(template):
     if template == 'None':
         return ''
 
-    for filepath in [shared.user_data_dir / 'instruction-templates' / f'{template}.yaml', shared.user_data_dir / 'instruction-templates' / 'Alpaca.yaml']:
-        if filepath.exists():
-            break
+    for name in (template, 'Alpaca'):
+        path = shared.user_data_dir / 'instruction-templates' / f'{name}.yaml'
+        try:
+            with open(path, 'r', encoding='utf-8') as f:
+                file_contents = f.read()
+        except FileNotFoundError:
+            if name == template:
+                logger.warning(f"Instruction template '{template}' not found, falling back to Alpaca")
+            continue
+
+        break
     else:
         return ''
-
-    with open(filepath, 'r', encoding='utf-8') as f:
-        file_contents = f.read()
     data = yaml.safe_load(file_contents)
     if 'instruction_template' in data:
         return data['instruction_template']

From 193424cc9359859b5b97bf5b229409a3fb727274 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 6 Apr 2026 10:07:52 -0700
Subject: [PATCH 72/76] API: Fix IPv6 address formatting

---
 modules/api/script.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/api/script.py b/modules/api/script.py
index 1f41d0cd..ceeca2dc 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -563,7 +563,7 @@ def run_server():
         server_addrs.append(shared.args.listen_host)
     else:
         if os.environ.get('OPENEDAI_ENABLE_IPV6', shared.args.api_enable_ipv6):
-            server_addrs.append('[::]' if shared.args.listen else '[::1]')
+            server_addrs.append('::' if shared.args.listen else '::1')
         if not os.environ.get('OPENEDAI_DISABLE_IPV4', shared.args.api_disable_ipv4):
             server_addrs.append('0.0.0.0' if shared.args.listen else '127.0.0.1')
 
@@ -580,7 +580,7 @@ def run_server():
         )
     else:
         url_proto = 'https://' if (ssl_certfile and ssl_keyfile) else 'http://'
-        urls = [f'{url_proto}{addr}:{port}/v1' for addr in server_addrs]
+        urls = [f'{url_proto}[{addr}]:{port}/v1' if ':' in addr else f'{url_proto}{addr}:{port}/v1' for addr in server_addrs]
         if len(urls) > 1:
             logger.info('OpenAI/Anthropic-compatible API URLs:\n\n' + '\n'.join(urls) + '\n')
         else:

From cb511928e2be4b7ee234582ecba96801fccf94fe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:06:28 -0700
Subject: [PATCH 73/76] Fix GPT-OSS tag leak during streaming between thinking
 and tool calls

---
 modules/reasoning.py    | 13 ++++++++++---
 modules/tool_parsing.py | 12 +++++++++---
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/modules/reasoning.py b/modules/reasoning.py
index 4a7cfa79..2b260818 100644
--- a/modules/reasoning.py
+++ b/modules/reasoning.py
@@ -73,9 +73,16 @@ def extract_reasoning(text, html_escaped=False):
                 if content_pos != -1:
                     content_start = content_pos + len(content_esc)
                 else:
-                    # Content tag not present — fall back to content after
-                    # end_tag (e.g. GPT-OSS tool calls skip the final channel).
-                    content_start = end_pos + len(end_esc)
+                    # Content tag not present yet.  In GPT-OSS the region
+                    # between <|end|> and the content tag contains internal
+                    # markup (<|start|>assistant…) that must not be shown.
+                    # Suppress it to prevent tag leaks during streaming.
+                    remainder = text[end_pos + len(end_esc):].lstrip()
+                    framing_token = esc('<|start|>')
+                    if not remainder or remainder.startswith(framing_token) or framing_token.startswith(remainder):
+                        content_start = len(text)
+                    else:
+                        content_start = end_pos + len(end_esc)
             else:
                 content_start = end_pos + len(end_esc)
 
diff --git a/modules/tool_parsing.py b/modules/tool_parsing.py
index 7fcf58b7..aa3e0e95 100644
--- a/modules/tool_parsing.py
+++ b/modules/tool_parsing.py
@@ -638,9 +638,15 @@ def parse_tool_call(answer: str, tool_names: list[str], return_prefix: bool = Fa
     # Strip thinking blocks so tool-call syntax inside <think> is ignored.
     original_answer = answer
     _, answer = extract_reasoning(answer)
-    # Offset between original and stripped text, used to map start_pos
-    # back to the original string when returning a prefix.
-    reasoning_offset = len(original_answer) - len(answer)
+    # Reasoning extraction returns empty content when GPT-OSS internal
+    # markup (<|start|>assistant…) follows the thinking block without a
+    # content tag.  Fall back to the full text so tool-call markers can
+    # be found.
+    if not answer.strip():
+        answer = original_answer
+        reasoning_offset = 0
+    else:
+        reasoning_offset = len(original_answer) - len(answer)
 
     matches = []
     start_pos = None

From 775c913de20824d187f677e65845fe8680ecd7f6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 6 Apr 2026 14:13:01 -0700
Subject: [PATCH 74/76] Fix crash when truncating prompts with tool call
 messages

---
 modules/chat.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index aeed688d..7e9cce60 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -568,13 +568,24 @@ def generate_chat_prompt(user_input, state, **kwargs):
         encoded_length = get_encoded_length(prompt)
         while len(messages) > 0 and encoded_length > max_length:
 
-            # Remove old message, save system message
             if len(messages) > 2 and messages[0]['role'] == 'system':
-                messages.pop(1)
-
-            # Remove old message when no system message is present
+                pop_idx = 1
             elif len(messages) > 1 and messages[0]['role'] != 'system':
-                messages.pop(0)
+                pop_idx = 0
+            else:
+                pop_idx = None
+
+            if pop_idx is not None:
+                messages.pop(pop_idx)
+
+                # Remove orphaned tool-call/tool-result messages that
+                # would be invalid without their partner.
+                while pop_idx < len(messages):
+                    msg = messages[pop_idx]
+                    if msg.get('role') == 'tool' or (msg.get('role') == 'assistant' and msg.get('tool_calls')):
+                        messages.pop(pop_idx)
+                    else:
+                        break
 
             # Resort to truncating the user input
             else:

From 778e1c4d52cc6f86cd55207543563773b12cd2cf Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 6 Apr 2026 17:04:49 -0700
Subject: [PATCH 75/76] Update llama.cpp/ik_llama.cpp

---
 requirements/full/requirements.txt                   | 8 ++++----
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 2 +-
 requirements/full/requirements_apple_silicon.txt     | 2 +-
 requirements/full/requirements_cpu_only.txt          | 8 ++++----
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 2 +-
 requirements/portable/requirements_apple_silicon.txt | 2 +-
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_ik.txt            | 4 ++--
 requirements/portable/requirements_ik_cpu_only.txt   | 4 ++--
 requirements/portable/requirements_ik_cuda131.txt    | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 15 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index d466e7e3..ed5841b8 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -41,10 +41,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index e88ff7c5..fe6ce28c 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -38,5 +38,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index eefd979e..09c01a61 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -38,4 +38,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d1b4e09f..42210407 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -38,4 +38,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 156ceb77..5cd7ae7d 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -38,7 +38,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 8a158f05..807ff079 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -24,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index a4949a46..55fe79ea 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -24,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 227823a6..6d4a63f7 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -24,4 +24,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 9779dd4a..aebb7c5b 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -24,4 +24,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index ff84907a..d7e2b051 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -24,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 89e43e1a..42a9a16f 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -24,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index a23d8ff0..c3fdb5e8 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -24,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index a200e80f..ea3ba601 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -24,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # ik_llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 8e9a097b..7530375d 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -24,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/ik_llama_cpp_binaries-0.106.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/ik_llama_cpp_binaries-0.110.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 59524668..3b8b0573 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -24,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.106.0/llama_cpp_binaries-0.106.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.110.0/llama_cpp_binaries-0.110.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From e18f32cba78d471dd86a924147aa3ea6638d5e97 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 6 Apr 2026 17:47:50 -0700
Subject: [PATCH 76/76] Remove hardcoded trust_remote_code=True in embedding
 loader

---
 modules/api/embeddings.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/api/embeddings.py b/modules/api/embeddings.py
index 16cf0482..17e595fb 100644
--- a/modules/api/embeddings.py
+++ b/modules/api/embeddings.py
@@ -6,6 +6,7 @@ from transformers import AutoModel
 from .errors import ServiceUnavailableError
 from .utils import debug_msg, float_list_to_base64
 from modules.logging_colors import logger
+from modules import shared
 
 embeddings_params_initialized = False
 
@@ -41,7 +42,7 @@ def load_embedding_model(model: str):
     try:
         logger.info(f"Try embedding model: {model} on {embeddings_device}")
         if 'jina-embeddings' in model:
-            embeddings_model = AutoModel.from_pretrained(model, trust_remote_code=True)  # trust_remote_code is needed to use the encode method
+            embeddings_model = AutoModel.from_pretrained(model, trust_remote_code=shared.args.trust_remote_code)
             embeddings_model = embeddings_model.to(embeddings_device)
         else:
             embeddings_model = SentenceTransformer(model, device=embeddings_device)