From 807be1183272fac409ce8f08609dbdd0d9f63362 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 24 Mar 2026 18:48:50 -0700 Subject: [PATCH 01/27] Remove obsolete models/config.yaml and related code --- docs/01 - Chat Tab.md | 2 +- docs/12 - OpenAI API.md | 2 +- modules/models.py | 1 - modules/models_settings.py | 9 +- modules/shared.py | 10 -- server.py | 5 - user_data/models/config.yaml | 203 ----------------------------------- 7 files changed, 4 insertions(+), 228 deletions(-) delete mode 100644 user_data/models/config.yaml diff --git a/docs/01 - Chat Tab.md b/docs/01 - Chat Tab.md index 5104895f..96b232fa 100644 --- a/docs/01 - Chat Tab.md +++ b/docs/01 - Chat Tab.md @@ -112,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template. -Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format. +Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any) from the model metadata (e.g. `tokenizer_config.json` or GGUF metadata), and will update the values under "Parameters" > "Instruction template" accordingly. You should check the model card on Hugging Face to see if you are using the correct prompt format. 
### Chat-instruct diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 2a7a7f69..0a076c35 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \ #### Chat completions -Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`. +Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be detected automatically from the model metadata. ```shell curl http://127.0.0.1:5000/v1/chat/completions \ diff --git a/modules/models.py b/modules/models.py index 1d139b89..b2665c6b 100644 --- a/modules/models.py +++ b/modules/models.py @@ -67,7 +67,6 @@ def load_model(model_name, loader=None): logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") logger.info(f"LOADER: \"{loader}\"") logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}") - logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"") return model, tokenizer diff --git a/modules/models_settings.py b/modules/models_settings.py index dcface71..eafa0581 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -23,14 +23,9 @@ def get_fallback_settings(): def get_model_metadata(model): model_path = resolve_model_path(model) - model_settings = {} - # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml - settings = shared.model_config - for pat in settings: - if re.match(pat.lower(), Path(model).name.lower()): - for k in settings[pat]: - model_settings[k] = settings[pat][k] + # Fallback settings + model_settings = get_fallback_settings() path = model_path / 'config.json' if path.exists(): diff --git a/modules/shared.py b/modules/shared.py index 16ccbe77..acb103b4 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ 
-454,17 +454,7 @@ def load_user_config(): args.loader = fix_loader_name(args.loader) -# Load model-specific settings -p = Path(f'{args.model_dir}/config.yaml') -if p.exists(): - model_config = yaml.safe_load(open(p, 'r').read()) -else: - model_config = {} -del p - - # Load custom model-specific settings user_config = load_user_config() -model_config = OrderedDict(model_config) user_config = OrderedDict(user_config) diff --git a/server.py b/server.py index d224909c..88936ca6 100644 --- a/server.py +++ b/server.py @@ -18,7 +18,6 @@ import modules.extensions as extensions_module from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model_if_idle from modules.models_settings import ( - get_fallback_settings, get_model_metadata, update_model_parameters ) @@ -271,10 +270,6 @@ if __name__ == "__main__": # Apply CLI overrides for image model settings (CLI flags take precedence over saved settings) shared.apply_image_model_cli_overrides() - # Fallback settings for models - shared.model_config['.*'] = get_fallback_settings() - shared.model_config.move_to_end('.*', last=False) # Move to the beginning - # Activate the extensions listed on settings.yaml extensions_module.available_extensions = utils.get_available_extensions() for extension in shared.settings['default_extensions']: diff --git a/user_data/models/config.yaml b/user_data/models/config.yaml deleted file mode 100644 index 038ebcf1..00000000 --- a/user_data/models/config.yaml +++ /dev/null @@ -1,203 +0,0 @@ -.*(llama|alpac|vicuna|guanaco|koala|llava|wizardlm|metharme|pygmalion-7b|pygmalion-2|mythalion|wizard-mega|openbuddy|vigogne|h2ogpt-research|manticore): - model_type: 'llama' -.*(opt-|opt_|opt1|opt3|optfor|galactica|galpaca|pygmalion-350m): - model_type: 'opt' -.*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1): - model_type: 'gptj' 
-.*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm): - model_type: 'gptneox' -.*bloom: - model_type: 'bloom' -.*gpt2: - model_type: 'gpt2' -.*falcon: - model_type: 'falcon' -.*mpt: - model_type: 'mpt' -.*(starcoder|starchat): - model_type: 'starcoder' -.*dolly-v2: - model_type: 'dollyv2' -.*replit: - model_type: 'replit' -.*(oasst|openassistant-|stablelm-7b-sft-v7-epoch-3): - instruction_template: 'Open Assistant' - skip_special_tokens: false -(?!.*galactica)(?!.*reward).*openassistant: - instruction_template: 'Open Assistant' - skip_special_tokens: false -.*galactica: - skip_special_tokens: false -.*dolly-v[0-9]-[0-9]*b: - instruction_template: 'Alpaca' - skip_special_tokens: false -.*alpaca-native-4bit: - instruction_template: 'Alpaca' -.*llava: - instruction_template: 'LLaVA' -.*llava.*1.5: - instruction_template: 'Vicuna-v1.1' -.*wizard.*mega: - instruction_template: 'Wizard-Mega' -.*starchat-beta: - instruction_template: 'Starchat-Beta' -(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna: - instruction_template: 'Vicuna-v0' -.*vicuna.*v0: - instruction_template: 'Vicuna-v0' -.*vicuna.*(1.1|1_1|1.3|1_3): - instruction_template: 'Vicuna-v1.1' -.*vicuna.*(1.5|1_5): - instruction_template: 'Vicuna-v1.1' -.*stable.*vicuna: - instruction_template: 'StableVicuna' -(?!.*chat).*chinese-vicuna: - instruction_template: 'Alpaca' -.*chinese-vicuna.*chat: - instruction_template: 'Chinese-Vicuna-Chat' -.*alpaca: - instruction_template: 'Alpaca' -.*koala: - instruction_template: 'Koala' -.*chatglm: - instruction_template: 'ChatGLM' -.*(metharme|pygmalion|mythalion): - instruction_template: 'Metharme' -.*raven: - instruction_template: 'RWKV-Raven' -.*moss-moon.*sft: - instruction_template: 'MOSS' -.*stablelm-tuned: - instruction_template: 'StableLM' -.*galactica.*finetuned: - instruction_template: 'Galactica Finetuned' -.*galactica.*-v2: - instruction_template: 
'Galactica v2' -(?!.*finetuned)(?!.*-v2).*galactica: - instruction_template: 'Galactica' -.*guanaco: - instruction_template: 'Guanaco non-chat' -.*baize: - instruction_template: 'Baize' -.*mpt-.*instruct: - instruction_template: 'Alpaca' -.*mpt-.*chat: - instruction_template: 'ChatML' -(?!.*-flan-)(?!.*-t5-).*lamini-: - instruction_template: 'Alpaca' -.*incite.*chat: - instruction_template: 'INCITE-Chat' -.*incite.*instruct: - instruction_template: 'INCITE-Instruct' -.*ziya-: - instruction_template: 'Ziya' -.*koalpaca: - instruction_template: 'KoAlpaca' -.*openbuddy: - instruction_template: 'OpenBuddy' -(?!.*chat).*vigogne: - instruction_template: 'Vigogne-Instruct' -.*vigogne.*chat: - instruction_template: 'Vigogne-Chat' -.*(llama-deus|supercot|llama-natural-instructions|open-llama-0.3t-7b-instruct-dolly-hhrlhf|open-llama-0.3t-7b-open-instruct): - instruction_template: 'Alpaca' -.*bactrian: - instruction_template: 'Bactrian' -.*(h2ogpt-oig-|h2ogpt-oasst1-|h2ogpt-research-oasst1-): - instruction_template: 'INCITE-Chat' -.*h2ogpt-gm-: - instruction_template: 'H2O-prompt_answer' -.*manticore: - instruction_template: 'Manticore Chat' -.*bluemoonrp-(30|13)b: - instruction_template: 'Bluemoon' -.*Nous-Hermes-13b: - instruction_template: 'Alpaca' -.*airoboros: - instruction_template: 'Vicuna-v1.1' -.*airoboros.*1.2: - instruction_template: 'Airoboros-v1.2' -.*alpa(cino|sta): - instruction_template: 'Alpaca' -.*hippogriff: - instruction_template: 'Hippogriff' -.*lazarus: - instruction_template: 'Alpaca' -.*guanaco-.*(7|13|33|65)b: - instruction_template: 'Vicuna-v0' -.*hypermantis: - instruction_template: 'Alpaca' -.*open-llama-.*-open-instruct: - instruction_template: 'Alpaca' -.*starcoder-gpteacher-code-instruct: - instruction_template: 'Alpaca' -.*tulu: - instruction_template: 'Tulu' -.*chronos: - instruction_template: 'Alpaca' -.*samantha: - instruction_template: 'Samantha' -.*wizardcoder: - instruction_template: 'Alpaca' -.*minotaur: - instruction_template: 
'Manticore Chat' -.*orca_mini: - instruction_template: 'Orca Mini' -.*(platypus|gplatty|superplatty): - instruction_template: 'Alpaca' -.*(openorca-platypus2): - instruction_template: 'OpenOrca-Platypus2' -.*longchat: - instruction_template: 'Vicuna-v1.1' -.*vicuna-33b: - instruction_template: 'Vicuna-v1.1' -.*redmond-hermes-coder: - instruction_template: 'Alpaca' -.*wizardcoder-15b: - instruction_template: 'Alpaca' -.*wizardlm: - instruction_template: 'Vicuna-v1.1' -.*godzilla: - instruction_template: 'Alpaca' -.*llama(-?)(2|v2).*chat: - instruction_template: 'Llama-v2' -.*newhope: - instruction_template: 'NewHope' -.*stablebeluga2: - instruction_template: 'StableBeluga2' -.*openchat: - instruction_template: 'OpenChat' -.*codellama.*instruct: - instruction_template: 'Llama-v2' -.*(mistral|mixtral).*instruct: - instruction_template: 'Mistral' -.*mistral.*openorca: - instruction_template: 'ChatML' -.*(WizardCoder-Python-34B-V1.0|Phind-CodeLlama-34B-v2|CodeBooga-34B-v0.1): - instruction_template: 'Alpaca' -.*orca-2-(13|7)b: - instruction_template: 'ChatML' -.*openhermes.*mistral: - instruction_template: 'ChatML' -.*Yi-34B-Chat: - instruction_template: 'ChatML' -(dolphin).*: - instruction_template: 'ChatML' -.*synthia: - instruction_template: 'Synthia' -.*(hercules|hyperion): - instruction_template: 'ChatML' -.*command-r: - instruction_template: 'Command-R' -.*xwin-lm-70b-v0.1: - instruction_template: 'Vicuna-v1.1' -.*platypus-yi-34b: - instruction_template: 'Vicuna-v1.1' -.*CausalLM-RP-34B: - instruction_template: 'ChatML' -34b-beta: - instruction_template: 'ChatML' -.*airoboros-3_1-yi-34b-200k: - instruction_template: 'Llama-v2' -.*chatqa: - instruction_template: 'NVIDIA-ChatQA' From d6f1485dd189494f6fbe5b6ea7ebd5cc0404233a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 24 Mar 2026 21:45:11 -0700 Subject: [PATCH 02/27] UI: Update the enable_thinking info message --- modules/ui_chat.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index f1dc7883..10d05f65 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -82,7 +82,7 @@ def create_ui(): gr.HTML("") shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.') gr.HTML("") From 368f37335f634ba001d00d2841902de85c7b48db Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 25 Mar 2026 06:37:45 -0700 Subject: [PATCH 03/27] Fix --idle-timeout issues with encode/decode and parallel generation --- modules/logits.py | 4 +--- modules/models.py | 15 ++++++++++++++- modules/text_generation.py | 18 +++++++++++++----- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/modules/logits.py b/modules/logits.py index 1f878f27..473f5890 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -4,7 +4,6 @@ import numpy as np from modules import models, shared from modules.logging_colors import logger -from modules.models import load_model from modules.text_generation import generate_reply from modules.utils import check_model_loaded @@ -12,8 +11,7 @@ global_scores = None def get_next_logits(*args, **kwargs): - if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: - shared.model, shared.tokenizer = load_model(shared.model_name) + models.load_model_if_idle_unloaded() needs_lock = not args[2] # use_samplers if needs_lock: diff --git a/modules/models.py b/modules/models.py index b2665c6b..61ca3838 100644 --- a/modules/models.py +++ 
b/modules/models.py @@ -1,4 +1,5 @@ import sys +import threading import time import modules.shared as shared @@ -7,6 +8,15 @@ from modules.models_settings import get_model_metadata from modules.utils import resolve_model_path last_generation_time = time.time() +active_generation_count = 0 +_generation_count_lock = threading.Lock() + + +def load_model_if_idle_unloaded(): + global last_generation_time + if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: + shared.model, shared.tokenizer = load_model(shared.model_name) + last_generation_time = time.time() def load_model(model_name, loader=None): @@ -158,7 +168,10 @@ def unload_model_if_idle(): while True: shared.generation_lock.acquire() try: - if time.time() - last_generation_time > shared.args.idle_timeout * 60: + with _generation_count_lock: + is_active = active_generation_count > 0 + + if not is_active and time.time() - last_generation_time > shared.args.idle_timeout * 60: if shared.model is not None: logger.info("Unloading the model for inactivity.") unload_model(keep_model_name=True) diff --git a/modules/text_generation.py b/modules/text_generation.py index f77be124..3a9ddab5 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -17,9 +17,7 @@ from modules.utils import check_model_loaded def generate_reply(*args, **kwargs): - if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: - from modules.models import load_model - shared.model, shared.tokenizer = load_model(shared.model_name) + models.load_model_if_idle_unloaded() state = args[1] if len(args) > 1 else kwargs.get('state', {}) use_parallel = ( @@ -31,10 +29,16 @@ def generate_reply(*args, **kwargs): if not use_parallel: shared.generation_lock.acquire() + with models._generation_count_lock: + models.active_generation_count += 1 + try: for result in _generate_reply(*args, **kwargs): yield result finally: + with models._generation_count_lock: 
+ models.active_generation_count -= 1 + models.last_generation_time = time.time() if not use_parallel: shared.generation_lock.release() @@ -126,7 +130,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None): if shared.tokenizer is None: - raise ValueError('No tokenizer is loaded') + models.load_model_if_idle_unloaded() + if shared.tokenizer is None: + raise ValueError('No tokenizer is loaded') # llama.cpp case if shared.model.__class__.__name__ == 'LlamaServer': @@ -176,7 +182,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt def decode(output_ids, skip_special_tokens=True): if shared.tokenizer is None: - raise ValueError('No tokenizer is loaded') + models.load_model_if_idle_unloaded() + if shared.tokenizer is None: + raise ValueError('No tokenizer is loaded') return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens) From e1541400219043f9b9cebf5f002b48251efc8bf9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 25 Mar 2026 07:21:02 -0700 Subject: [PATCH 04/27] Rename "truncation length" to "context length" in logs --- modules/api/models.py | 2 +- modules/models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/api/models.py b/modules/api/models.py index c879a860..b89397d3 100644 --- a/modules/api/models.py +++ b/modules/api/models.py @@ -68,7 +68,7 @@ def _load_model(data): if k in shared.settings: shared.settings[k] = settings[k] if k == 'truncation_length': - logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}") + logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}") elif k == 'instruction_template': logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}") diff --git a/modules/models.py b/modules/models.py index 
61ca3838..e997d2d8 100644 --- a/modules/models.py +++ b/modules/models.py @@ -76,7 +76,7 @@ def load_model(model_name, loader=None): logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") logger.info(f"LOADER: \"{loader}\"") - logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}") + logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}") return model, tokenizer From 4cbea02ed4e0dee2efd066ac48bcdf33631b9eca Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 26 Mar 2026 06:49:39 -0700 Subject: [PATCH 05/27] Add ik_llama.cpp support via `--ik` flag --- modules/llama_cpp_server.py | 37 +++++++++++++++++++++++++++++++++++++ modules/shared.py | 1 + 2 files changed, 38 insertions(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 2ae01ddc..9b9756a9 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -470,6 +470,10 @@ class LlamaServer: else: cmd.append(f"--{flag_item}") + # Patch flags for ik_llama.cpp compatibility + if shared.args.ik: + cmd = _patch_cmd_for_ik(cmd) + env = os.environ.copy() if os.name == 'posix': current_path = env.get('LD_LIBRARY_PATH', '') @@ -607,3 +611,36 @@ def filter_stderr_with_progress(process_stderr): process_stderr.close() except Exception: pass + + +def _patch_cmd_for_ik(cmd): + """ + Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents: + --no-webui → --webui none + --fit on → --fit (bare flag) + --fit off / --fit-ctx → (removed, along with the value) + --fit-target → --fit-margin + """ + patched = [] + i = 0 + while i < len(cmd): + arg = cmd[i] + + if arg == "--no-webui": + patched += ["--webui", "none"] + elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"): + val = cmd[i + 1] + i += 1 + if val == "on": + patched.append("--fit") + # "off" → drop entirely + elif arg == "--fit-ctx": + i += 1 # skip the value + elif arg == "--fit-target": + patched.append("--fit-margin") + else: + 
patched.append(arg) + + i += 1 + + return patched diff --git a/modules/shared.py b/modules/shared.py index acb103b4..c50736d7 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -110,6 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.') group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.') group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"') +group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside /lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') From bda95172bd6abecba165fc118f140cfc446f3c42 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Mar 2026 06:09:53 -0700 Subject: [PATCH 06/27] Fix stopping string detection for chromadb/context-1 --- modules/chat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index f8088e0f..edda11b0 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -671,7 +671,10 @@ def get_stopping_strings(state): # Handle GPT-OSS as a special case if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result: result.remove("<|end|>") - result.append("<|result|>") + if '<|result|>' in state['instruction_template_str']: + 
result.append("<|result|>") + elif '<|return|>' in state['instruction_template_str']: + result.append("<|return|>") result = list(set(result)) if shared.args.verbose: From 9dd04b86ce407507bcaf0862b97aadc64b6e62a6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Mar 2026 06:17:57 -0700 Subject: [PATCH 07/27] Suppress EOS token at logit level for ExLlamav3 when ban_eos_token is set --- modules/exllamav3.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 75c76c7c..f873503a 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -423,6 +423,15 @@ class Exllamav3Model: if logit_bias: filters.append(LogitBiasFilter(self.tokenizer, logit_bias)) + # Suppress EOS tokens via logit bias so they are never sampled + if state['ban_eos_token']: + eos_bias = {} + for eos_id in self.config.eos_token_id_list: + if eos_id is not None: + eos_bias[str(eos_id)] = float('-inf') + if eos_bias: + filters.append(LogitBiasFilter(self.tokenizer, eos_bias)) + # Logprobs support (OpenAI API) logprobs = state.get('logprobs', 0) or 0 return_top_tokens = logprobs if logprobs > 0 else 0 From 4979e87e48c78d5e3186e4d9b2fbc8b30e86164f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Mar 2026 11:49:47 -0300 Subject: [PATCH 08/27] Add ik_llama.cpp support via ik_llama_cpp_binaries package --- .github/workflows/build-everything-tgw.yml | 35 +++ .../build-portable-release-ik-cuda.yml | 179 +++++++++++++++ .../workflows/build-portable-release-ik.yml | 205 ++++++++++++++++++ modules/llama_cpp_server.py | 21 +- modules/loaders.py | 2 + modules/shared.py | 2 +- modules/ui_model_menu.py | 3 + requirements/full/requirements.txt | 6 +- requirements/full/requirements_amd.txt | 4 +- .../full/requirements_apple_intel.txt | 3 +- .../full/requirements_apple_silicon.txt | 3 +- requirements/full/requirements_cpu_only.txt | 6 +- 
requirements/portable/requirements.txt | 4 +- requirements/portable/requirements_amd.txt | 4 +- .../portable/requirements_apple_intel.txt | 2 +- .../portable/requirements_apple_silicon.txt | 2 +- .../portable/requirements_cpu_only.txt | 4 +- .../portable/requirements_cuda131.txt | 4 +- requirements/portable/requirements_vulkan.txt | 4 +- 19 files changed, 469 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/build-portable-release-ik-cuda.yml create mode 100644 .github/workflows/build-portable-release-ik.yml diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml index 9322f859..4de591f4 100644 --- a/.github/workflows/build-everything-tgw.yml +++ b/.github/workflows/build-everything-tgw.yml @@ -68,3 +68,38 @@ jobs: with: version: ${{ inputs.version }} config: 'os:macos-15-intel,macos-14' + + build_release_ik_cuda_windows: + name: ik CUDA Windows + uses: ./.github/workflows/build-portable-release-ik-cuda.yml + with: + version: ${{ inputs.version }} + config: 'os:windows-2022' + + build_release_ik_cuda_linux: + name: ik CUDA Linux + uses: ./.github/workflows/build-portable-release-ik-cuda.yml + with: + version: ${{ inputs.version }} + config: 'os:ubuntu-22.04' + + build_release_ik_cpu_windows: + name: ik CPU Windows + uses: ./.github/workflows/build-portable-release-ik.yml + with: + version: ${{ inputs.version }} + config: 'os:windows-2022' + + build_release_ik_cpu_linux: + name: ik CPU Linux + uses: ./.github/workflows/build-portable-release-ik.yml + with: + version: ${{ inputs.version }} + config: 'os:ubuntu-22.04' + + build_release_ik_macos: + name: ik macOS + uses: ./.github/workflows/build-portable-release-ik.yml + with: + version: ${{ inputs.version }} + config: 'os:macos-14' diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml new file mode 100644 index 00000000..40b4b92f --- /dev/null +++ 
b/.github/workflows/build-portable-release-ik-cuda.yml @@ -0,0 +1,179 @@ +name: Build ik CUDA + +on: + workflow_dispatch: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + workflow_call: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + env: + CONFIGIN: ${{ inputs.config }} + EXCLUDEIN: ${{ inputs.exclude }} + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('ubuntu-22.04', 'windows-2022') + 'pyver' = @("3.13") + 'cuda' = @("12.4", "13.1") + } + + if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} + + if ($env:EXCLUDEIN -ne 'None') { + $exclusions = @() + $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData + $matrix['exclude'] = $exclusions + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: ${{ 
matrix.os }} ${{ matrix.pyver }} CUDA ${{ matrix.cuda }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + defaults: + run: + shell: pwsh + env: + PCKGVER: ${{ inputs.version }} + + steps: + - uses: actions/checkout@v6 + with: + repository: 'oobabooga/text-generation-webui' + ref: ${{ inputs.version }} + submodules: 'recursive' + + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.pyver }} + + - name: Build Package + shell: bash + run: | + VERSION_CLEAN="${{ inputs.version }}" + VERSION_CLEAN="${VERSION_CLEAN#v}" + cd .. + cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}" + cd "text-generation-webui-${VERSION_CLEAN}" + + # Remove extensions that need additional requirements + allowed=("character_bias" "gallery" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf + + # Define common variables + CUDA_VERSION="${{ matrix.cuda }}" + VERSION="${{ inputs.version }}" + + # 1. Set platform-specific variables + if [[ "$RUNNER_OS" == "Windows" ]]; then + PLATFORM="windows" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" + PIP_PATH="portable_env/python.exe -m pip" + PACKAGES_PATH="portable_env/Lib/site-packages" + rm start_linux.sh start_macos.sh + else + PLATFORM="linux" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.13/site-packages" + rm start_macos.sh start_windows.bat + fi + + # 2. Download and extract Python + cd .. + echo "Downloading Python for $PLATFORM..." 
+ curl -L -o python-build.tar.gz "$PYTHON_URL" + tar -xzf python-build.tar.gz + mv python "text-generation-webui-${VERSION_CLEAN}/portable_env" + + # 3. Prepare requirements file based on CUDA version + cd "text-generation-webui-${VERSION_CLEAN}" + if [[ "$CUDA_VERSION" == "13.1" ]]; then + REQ_FILE="requirements/portable/requirements_cuda131.txt" + else + REQ_FILE="requirements/portable/requirements.txt" + fi + + # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts + sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" + sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true + + # 5. Install packages + echo "Installing Python packages from $REQ_FILE..." + $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE" + + # 6. Clean up + rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py + + # 7. Create archive + cd .. + if [[ "$RUNNER_OS" == "Windows" ]]; then + ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip" + echo "Creating archive: $ARCHIVE_NAME" + powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME" + else + ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz" + echo "Creating archive: $ARCHIVE_NAME" + tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}" + fi + + - name: Upload files to a GitHub release + id: upload-release + uses: svenstaro/upload-release-action@2.7.0 + continue-on-error: true + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: ../textgen-portable-ik-* + tag: ${{ inputs.version }} + file_glob: true + make_latest: false + overwrite: true diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml new file mode 100644 index 00000000..afb2e763 --- /dev/null +++ 
b/.github/workflows/build-portable-release-ik.yml @@ -0,0 +1,205 @@ +name: Build ik CPU and macOS + +on: + workflow_dispatch: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + workflow_call: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + env: + CONFIGIN: ${{ inputs.config }} + EXCLUDEIN: ${{ inputs.exclude }} + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14') + 'pyver' = @("3.13") + } + + if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} + + if ($env:EXCLUDEIN -ne 'None') { + $exclusions = @() + $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData + $matrix['exclude'] = $exclusions + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: ${{ matrix.os }} ${{ 
matrix.pyver }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + defaults: + run: + shell: pwsh + env: + PCKGVER: ${{ inputs.version }} + + steps: + - uses: actions/checkout@v6 + with: + repository: 'oobabooga/text-generation-webui' + ref: ${{ inputs.version }} + submodules: 'recursive' + + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.pyver }} + + - name: Build Package + shell: bash + run: | + VERSION_CLEAN="${{ inputs.version }}" + VERSION_CLEAN="${VERSION_CLEAN#v}" + cd .. + cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}" + cd "text-generation-webui-${VERSION_CLEAN}" + + # Remove extensions that need additional requirements + allowed=("character_bias" "gallery" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf + + # Define common variables + VERSION="${{ inputs.version }}" + OS_TYPE="${{ matrix.os }}" + + # 1. 
Set platform-specific variables + if [[ "$RUNNER_OS" == "Windows" ]]; then + PLATFORM="windows-cpu" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" + PIP_PATH="portable_env/python.exe -m pip" + PACKAGES_PATH="portable_env/Lib/site-packages" + rm start_linux.sh start_macos.sh + elif [[ "$RUNNER_OS" == "macOS" ]]; then + if [[ "$OS_TYPE" == "macos-15-intel" ]]; then + PLATFORM="macos-x86_64" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz" + REQ_TYPE="apple_intel" + else + PLATFORM="macos-arm64" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz" + REQ_TYPE="apple_silicon" + fi + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.13/site-packages" + rm start_linux.sh start_windows.bat + else + # Linux case + PLATFORM="linux-cpu" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.13/site-packages" + rm start_macos.sh start_windows.bat + fi + + # 2. Download and extract Python + echo "Downloading Python for $PLATFORM..." + cd .. + curl -L -o python-build.tar.gz "$PYTHON_URL" + tar -xzf python-build.tar.gz + mv python "text-generation-webui-${VERSION_CLEAN}/portable_env" + + # 3. 
Prepare requirements file based on platform + cd "text-generation-webui-${VERSION_CLEAN}" + + # Select requirements file based on platform + if [[ "$RUNNER_OS" == "macOS" ]]; then + if [[ "$OS_TYPE" == "macos-15-intel" ]]; then + REQ_FILE="requirements/portable/requirements_apple_intel.txt" + else + REQ_FILE="requirements/portable/requirements_apple_silicon.txt" + fi + else + REQ_FILE="requirements/portable/requirements_cpu_only.txt" + fi + + echo "Using requirements file: $REQ_FILE" + + # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts + if [[ "$RUNNER_OS" == "macOS" ]]; then + sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" + sed -i '' 's/--portable/--portable --ik/g' start_macos.sh + else + sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" + sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true + fi + + # 5. Install packages + echo "Installing Python packages from $REQ_FILE..." + $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE" + + # 6. Clean up + rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py + + # 7. Create archive + cd .. 
+ if [[ "$RUNNER_OS" == "Windows" ]]; then + ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip" + echo "Creating archive: $ARCHIVE_NAME" + powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME" + else + ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz" + echo "Creating archive: $ARCHIVE_NAME" + tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}" + fi + + - name: Upload files to a GitHub release + id: upload-release + uses: svenstaro/upload-release-action@2.7.0 + continue-on-error: true + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: ../textgen-portable-ik-* + tag: ${{ inputs.version }} + file_glob: true + make_latest: false + overwrite: true diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 9b9756a9..5e2decfa 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -11,7 +11,6 @@ import time from pathlib import Path from typing import Any, List -import llama_cpp_binaries import requests from modules import shared @@ -357,7 +356,16 @@ class LlamaServer: """Start the llama.cpp server and wait until it's ready.""" # Determine the server path if self.server_path is None: - self.server_path = llama_cpp_binaries.get_binary_path() + if shared.args.ik: + try: + import ik_llama_cpp_binaries + except ImportError: + raise ImportError("--ik requires the ik_llama_cpp_binaries package. 
Install it with: pip install ") + + self.server_path = ik_llama_cpp_binaries.get_binary_path() + else: + import llama_cpp_binaries + self.server_path = llama_cpp_binaries.get_binary_path() # Build the command cmd = [ @@ -616,10 +624,12 @@ def filter_stderr_with_progress(process_stderr): def _patch_cmd_for_ik(cmd): """ Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents: - --no-webui → --webui none + --no-webui → --webui none --fit off → (removed) --fit on / --fit-ctx → --fit (bare flag) --fit-target → --fit-margin + --cache-reuse → (removed, unsupported) + --swa-full → (removed, unsupported) """ patched = [] i = 0 @@ -635,9 +645,14 @@ def _patch_cmd_for_ik(cmd): patched.append("--fit") # "off" → drop entirely elif arg == "--fit-ctx": + patched.append("--fit") i += 1 # skip the value elif arg == "--fit-target": patched.append("--fit-margin") + elif arg == "--cache-reuse": + i += 1 # skip the value + elif arg == "--swa-full": + pass # bare flag, just drop it else: patched.append(arg) diff --git a/modules/loaders.py b/modules/loaders.py index c90f2ebb..cb1f3d3b 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'mlock', 'numa', + 'ik', 'parallel', 'model_draft', 'draft_max', @@ -345,6 +346,7 @@ def list_model_elements(): 'spec_ngram_size_m', 'spec_ngram_min_hits', 'mmproj', + 'ik', ] diff --git a/modules/shared.py b/modules/shared.py index c50736d7..13843f0c 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -110,7 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.') group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. 
A single value is broadcast across all devices.') group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"') -group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside /lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.') +group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. Requires the ik_llama_cpp_binaries package to be installed.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 5b7621a7..16505afa 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -51,6 +51,9 @@ def create_ui(): with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) + if not shared.args.portable: + shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.') + shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. 
Saves VRAM on MoE models.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 56619627..100c99d1 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -40,8 +40,10 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == 
"Linux" and platform_machine == "x86_64" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 620683cc..66fa4ac7 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index b1f109b2..98dc8be6 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -37,4 +37,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a54476a9..e33264cf 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -37,4 +37,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index be82c904..cd083f6d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -37,5 +37,7 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 188da380..67182225 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 4562b6d0..5f5b2f8d 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 04dcf25e..f5f7d6ee 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 4b8af78a..e51fc296 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 5b0eaf89..683f94c8 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index 90b3234f..942d0877 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index ea72b4ec..ae784e00 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 
tiktoken # Vulkan wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From be6fc0663ac1b7a60b7fde24afb38de2b0aba57b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Mar 2026 08:11:28 -0700 Subject: [PATCH 09/27] Update the custom gradio wheels --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 4 ++-- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_nowheels.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 4 ++-- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cuda131.txt | 4 ++-- requirements/portable/requirements_nowheels.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- 14 files changed, 28 insertions(+), 28 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 100c99d1..6e11dd2f 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -31,8 +31,8 @@ 
tqdm wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 66fa4ac7..c964eff6 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 98dc8be6..b1dd6a4f 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index e33264cf..4d03d280 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index cd083f6d..9d41d069 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 77c254e6..052085cc 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -28,8 +28,8 @@ 
trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 67182225..ff80b6c8 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 5f5b2f8d..318044da 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index f5f7d6ee..1676bffb 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index e51fc296..27fc2da8 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 683f94c8..0bbdd30a 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ 
b/requirements/portable/requirements_cpu_only.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index 942d0877..c3ae3c57 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index e8457909..e38140ce 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index ae784e00..e646c04c 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 From 0466b6e2714a05c04eff0c929f15e4679f029e8d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 29 Mar 2026 15:52:36 -0700 Subject: [PATCH 10/27] ik_llama.cpp: Auto-enable Hadamard KV cache rotation with quantized cache --- modules/llama_cpp_server.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 5e2decfa..fa968be1 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -631,6 +631,12 @@ def _patch_cmd_for_ik(cmd): --cache-reuse → (removed, unsupported) --swa-full → (removed, unsupported) """ + # Add Hadamard KV cache rotation when using quantized cache types. + # This significantly improves quantized cache quality (especially q4_0) + # and is a no-op for MLA models like DeepSeek. 
+ if shared.args.cache_type in ("q8_0", "q4_0"): + cmd += ["-khad", "-vhad"] + patched = [] i = 0 while i < len(cmd): From 6382fbef8381bf60ff909b4fd76e7c1f4c063afc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 30 Mar 2026 17:44:19 -0700 Subject: [PATCH 11/27] Several small code simplifications --- download-model.py | 25 +++--- js/dark_theme.js | 12 ++- js/global_scope_js.js | 79 +++++++++--------- js/main.js | 171 +++++++++++++-------------------------- js/save_files.js | 18 ++--- js/show_controls.js | 21 ++--- js/switch_tabs.js | 24 ++---- js/update_big_picture.js | 3 +- modules/extensions.py | 22 +++-- 9 files changed, 140 insertions(+), 235 deletions(-) diff --git a/download-model.py b/download-model.py index 95d25e16..a31bbfc6 100644 --- a/download-model.py +++ b/download-model.py @@ -158,28 +158,21 @@ class ModelDownloader: # Also if GGUF and safetensors are available, download only safetensors if (has_pytorch or has_pt or has_gguf) and has_safetensors: has_gguf = False - for i in range(len(classifications) - 1, -1, -1): - if classifications[i] in ['pytorch', 'pt', 'gguf']: - links.pop(i) - file_sizes.pop(i) + keep = [i for i, c in enumerate(classifications) if c not in ['pytorch', 'pt', 'gguf']] + links = [links[i] for i in keep] + file_sizes = [file_sizes[i] for i in keep] # For GGUF, try to download only the Q4_K_M if no specific file is specified. 
if has_gguf and specific_file is None: - has_q4km = False - for i in range(len(classifications) - 1, -1, -1): - if 'q4_k_m' in links[i].lower(): - has_q4km = True + has_q4km = any('q4_k_m' in link.lower() for link in links) if has_q4km: - for i in range(len(classifications) - 1, -1, -1): - if 'q4_k_m' not in links[i].lower(): - links.pop(i) - file_sizes.pop(i) + keep = [i for i, link in enumerate(links) if 'q4_k_m' in link.lower()] else: - for i in range(len(classifications) - 1, -1, -1): - if links[i].lower().endswith('.gguf'): - links.pop(i) - file_sizes.pop(i) + keep = [i for i, link in enumerate(links) if not link.lower().endswith('.gguf')] + + links = [links[i] for i in keep] + file_sizes = [file_sizes[i] for i in keep] is_llamacpp = has_gguf and specific_file is not None return links, sha256, is_lora, is_llamacpp, file_sizes diff --git a/js/dark_theme.js b/js/dark_theme.js index 7136f5bf..9d7069e2 100644 --- a/js/dark_theme.js +++ b/js/dark_theme.js @@ -1,6 +1,6 @@ function toggleDarkMode() { document.body.classList.toggle("dark"); - var currentCSS = document.getElementById("highlight-css"); + const currentCSS = document.getElementById("highlight-css"); if (currentCSS.getAttribute("href") === "file/css/highlightjs/github-dark.min.css") { currentCSS.setAttribute("href", "file/css/highlightjs/github.min.css"); } else { @@ -9,12 +9,10 @@ function toggleDarkMode() { // Re-highlight all code blocks once stylesheet loads currentCSS.onload = function() { - const messageBodies = document.getElementById("chat").querySelectorAll(".message-body"); - messageBodies.forEach((messageBody) => { - const codeBlocks = messageBody.querySelectorAll("pre code"); - codeBlocks.forEach((codeBlock) => { - hljs.highlightElement(codeBlock); - }); + // Clear data-highlighted so hljs will re-process with the new theme + document.querySelectorAll("#chat .message-body pre code[data-highlighted]").forEach((codeBlock) => { + delete codeBlock.dataset.highlighted; }); + doSyntaxHighlighting(); 
}; } diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 92f65622..20eeef66 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -1,11 +1,35 @@ +// ------------------------------------------------- +// Shared helpers +// ------------------------------------------------- + +function getProfilePictureUrl() { + return "/file/user_data/cache/pfp_character.png?time=" + Date.now(); +} + +const MESSAGE_SELECTOR = ".message, .user-message, .assistant-message"; + +function getMessageElement(element) { + if (!element) return null; + return element.closest(MESSAGE_SELECTOR); +} + +function isUserRole(messageElement) { + return messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") !== null || + messageElement.querySelector(".circle-you") !== null; +} + +// Trigger a synthetic 'input' event so Gradio picks up programmatic value changes +function dispatchGradioInput(element) { + element.dispatchEvent(new Event("input", { bubbles: true })); +} + // ------------------------------------------------- // Event handlers // ------------------------------------------------- function copyToClipboard(element) { - if (!element) return; - - const messageElement = element.closest(".message, .user-message, .assistant-message"); + const messageElement = getMessageElement(element); if (!messageElement) return; const rawText = messageElement.getAttribute("data-raw"); @@ -48,9 +72,7 @@ function fallbackCopyToClipboard(text) { } function branchHere(element) { - if (!element) return; - - const messageElement = element.closest(".message, .user-message, .assistant-message"); + const messageElement = getMessageElement(element); if (!messageElement) return; const index = messageElement.getAttribute("data-index"); @@ -69,11 +91,7 @@ function branchHere(element) { } branchIndexInput.value = index; - - // Trigger any 'change' or 'input' events Gradio might be listening for - const event = new Event("input", { bubbles: true }); - 
branchIndexInput.dispatchEvent(event); - + dispatchGradioInput(branchIndexInput); branchButton.click(); } @@ -82,9 +100,7 @@ function branchHere(element) { // ------------------------------------------------- function editHere(buttonElement) { - if (!buttonElement) return; - - const messageElement = buttonElement.closest(".message, .user-message, .assistant-message"); + const messageElement = getMessageElement(buttonElement); if (!messageElement) return; const messageBody = messageElement.querySelector(".message-body"); @@ -97,12 +113,7 @@ function editHere(buttonElement) { return; } - // Determine role based on message element - handle different chat modes - const isUserMessage = messageElement.classList.contains("user-message") || - messageElement.querySelector(".text-you") !== null || - messageElement.querySelector(".circle-you") !== null; - - startEditing(messageElement, messageBody, isUserMessage); + startEditing(messageElement, messageBody, isUserRole(messageElement)); } function startEditing(messageElement, messageBody, isUserMessage) { @@ -209,30 +220,22 @@ function submitMessageEdit(index, newText, isUserMessage) { editTextInput.value = newText; editRoleInput.value = isUserMessage ? 
"user" : "assistant"; - editIndexInput.dispatchEvent(new Event("input", { bubbles: true })); - editTextInput.dispatchEvent(new Event("input", { bubbles: true })); - editRoleInput.dispatchEvent(new Event("input", { bubbles: true })); + dispatchGradioInput(editIndexInput); + dispatchGradioInput(editTextInput); + dispatchGradioInput(editRoleInput); editButton.click(); return true; } function navigateVersion(element, direction) { - if (!element) return; - - const messageElement = element.closest(".message, .user-message, .assistant-message"); + const messageElement = getMessageElement(element); if (!messageElement) return; const index = messageElement.getAttribute("data-index"); if (!index) return; - // Determine role based on message element classes - let role = "assistant"; // Default role - if (messageElement.classList.contains("user-message") || - messageElement.querySelector(".text-you") || - messageElement.querySelector(".circle-you")) { - role = "user"; - } + const role = isUserRole(messageElement) ? 
"user" : "assistant"; const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input"); const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea"); @@ -248,11 +251,9 @@ function navigateVersion(element, direction) { directionInput.value = direction; roleInput.value = role; - // Trigger 'input' events for Gradio to pick up changes - const event = new Event("input", { bubbles: true }); - indexInput.dispatchEvent(event); - directionInput.dispatchEvent(event); - roleInput.dispatchEvent(event); + dispatchGradioInput(indexInput); + dispatchGradioInput(directionInput); + dispatchGradioInput(roleInput); navigateButton.click(); } @@ -313,7 +314,7 @@ function handleMorphdomUpdate(data) { function applyMorphdomUpdate(data) { // Determine target element and use it as query scope - var target_element, target_html; + let target_element, target_html; if (data.last_message_only) { const childNodes = document.getElementsByClassName("messages")[0].childNodes; target_element = childNodes[childNodes.length - 1]; diff --git a/js/main.js b/js/main.js index f05f93c6..cba4c903 100644 --- a/js/main.js +++ b/js/main.js @@ -4,8 +4,9 @@ // Sync highlight.js theme with the actual Gradio theme var defined_hljs_css = document.body.classList.contains("dark") ? 
"file/css/highlightjs/github-dark.min.css" : "file/css/highlightjs/github.min.css"; -if (document.getElementById("highlight-css").getAttribute("href") !== defined_hljs_css) { - document.getElementById("highlight-css").setAttribute("href", defined_hljs_css); +var hljsCssElement = document.getElementById("highlight-css"); +if (hljsCssElement.getAttribute("href") !== defined_hljs_css) { + hljsCssElement.setAttribute("href", defined_hljs_css); } let main_parent = document.getElementById("chat-tab").parentNode; @@ -49,21 +50,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event) //------------------------------------------------ // --- Helper functions --- // -function isModifiedKeyboardEvent() { - return (event instanceof KeyboardEvent && - event.shiftKey || - event.ctrlKey || - event.altKey || - event.metaKey); +function isModifiedKeyboardEvent(event) { + return event instanceof KeyboardEvent && + (event.shiftKey || event.ctrlKey || event.altKey || event.metaKey); } -function isFocusedOnEditableTextbox() { +function isFocusedOnEditableTextbox(event) { if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") { return !!event.target.value; } + return false; } -let previousTabId = "chat-tab-button"; document.addEventListener("keydown", function(event) { // Stop generation on Esc pressed if (event.key === "Escape") { @@ -117,14 +115,14 @@ document.addEventListener("keydown", function(event) { } // --- Simple version navigation --- // - if (!isFocusedOnEditableTextbox()) { + if (!isFocusedOnEditableTextbox(event)) { // Version navigation on Arrow keys (horizontal) - if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") { + if (!isModifiedKeyboardEvent(event) && event.key === "ArrowLeft") { event.preventDefault(); navigateLastAssistantMessage("left"); } - else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") { + else if (!isModifiedKeyboardEvent(event) && event.key === "ArrowRight") { 
event.preventDefault(); if (!navigateLastAssistantMessage("right")) { // If can't navigate right (last version), regenerate @@ -159,9 +157,8 @@ targetElement.addEventListener("scroll", function() { let diff = targetElement.scrollHeight - targetElement.clientHeight; let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0; - // Add scrolling class to disable hover effects if (window.isScrolled || !isAtBottomNow) { - targetElement.classList.add("scrolling"); + targetElement.classList.add("scrolling"); // Disables hover effects during scroll } if(isAtBottomNow) { @@ -202,12 +199,8 @@ const observer = new MutationObserver(function() { }); // Only watch for attribute changes on targetElement (e.g. _generating class) -const config = { - attributes: true -}; - // Start observing the target element -observer.observe(targetElement, config); +observer.observe(targetElement, { attributes: true }); //------------------------------------------------ // Handle syntax highlighting / LaTeX @@ -228,7 +221,7 @@ window.doSyntaxHighlighting = function() { if (messageBodies.length > 0) { let hasSeenVisible = false; - // Go from last message to first + // Go from last message to first so we can early-exit once past visible area for (let i = messageBodies.length - 1; i >= 0; i--) { const messageBody = messageBodies[i]; @@ -243,8 +236,8 @@ window.doSyntaxHighlighting = function() { codeBlock.classList.add("pretty_scrollbar"); }); - // Only render math in visible elements const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt"); + // Only render math in individually visible containers (the outer check is on the message body) mathContainers.forEach(container => { if (isElementVisibleOnScreen(container)) { renderMathInElement(container, { @@ -271,7 +264,7 @@ const doSyntaxHighlighting = window.doSyntaxHighlighting; // Add some scrollbars //------------------------------------------------ 
const scrollbarElements = document.querySelectorAll(".add_scrollbar textarea, .add_scrollbar .drag-drop-list"); -for(i = 0; i < scrollbarElements.length; i++) { +for(let i = 0; i < scrollbarElements.length; i++) { scrollbarElements[i].classList.remove("scroll-hide"); scrollbarElements[i].classList.add("pretty_scrollbar"); scrollbarElements[i].style.resize = "none"; @@ -298,13 +291,13 @@ if (toolsInfo) { // Remove some backgrounds //------------------------------------------------ const noBackgroundelements = document.querySelectorAll(".no-background"); -for(i = 0; i < noBackgroundelements.length; i++) { +for(let i = 0; i < noBackgroundelements.length; i++) { noBackgroundelements[i].parentNode.style.border = "none"; noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center"; } const slimDropdownElements = document.querySelectorAll(".slim-dropdown"); -for (i = 0; i < slimDropdownElements.length; i++) { +for (let i = 0; i < slimDropdownElements.length; i++) { const parentNode = slimDropdownElements[i].parentNode; parentNode.style.background = "transparent"; parentNode.style.border = "0"; @@ -374,49 +367,43 @@ button.addEventListener("click", function () { } }); -// Add event listener for mouseleave on the button -button.addEventListener("mouseleave", function () { - // Delay to prevent menu hiding when the mouse leaves the button into the menu +// Delay to prevent menu hiding when the mouse leaves the button or menu +function delayedHideMenu() { setTimeout(function () { if (!isMouseOverButtonOrMenu()) { hideMenu(); } }, 100); -}); +} +// Add event listener for mouseleave on the button +button.addEventListener("mouseleave", delayedHideMenu); // Add event listener for mouseleave on the menu -menu.addEventListener("mouseleave", function () { - // Delay to prevent menu hide when the mouse leaves the menu into the button - setTimeout(function () { - if (!isMouseOverButtonOrMenu()) { - hideMenu(); - } - }, 100); -}); 
+menu.addEventListener("mouseleave", delayedHideMenu); // Add event listener for click anywhere in the document document.addEventListener("click", function (event) { - const target = event.target; - // Check if the click is outside the button/menu and the menu is visible if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") { hideMenu(); } - if (event.target.classList.contains("pfp_character")) { + const target = event.target; + + if (target.classList.contains("pfp_character")) { toggleBigPicture(); } // Handle sidebar clicks on mobile if (isMobile()) { - // Check if the click did NOT originate from any of the specified toggle buttons or elements + // Check if the click did NOT originate from any of the specified toggle buttons or elements if ( target.closest("#navigation-toggle") !== navigationToggle && - target.closest("#past-chats-toggle") !== pastChatsToggle && - target.closest("#chat-controls-toggle") !== chatControlsToggle && - target.closest(".header_bar") !== headerBar && - target.closest("#past-chats-row") !== pastChatsRow && - target.closest("#chat-controls") !== chatControlsRow + target.closest("#past-chats-toggle") !== pastChatsToggle && + target.closest("#chat-controls-toggle") !== chatControlsToggle && + target.closest(".header_bar") !== headerBar && + target.closest("#past-chats-row") !== pastChatsRow && + target.closest("#chat-controls") !== chatControlsRow ) { handleIndividualSidebarClose(event); } @@ -433,27 +420,19 @@ document.getElementById("chat-input-row").classList.add("chat-input-positioned") //------------------------------------------------ const chatTextArea = document.getElementById("chat-input").querySelector("textarea"); -function respondToChatInputVisibility(element, callback) { - var options = { - root: document.documentElement, - }; - - var observer = new IntersectionObserver((entries, observer) => { +function focusOnVisible(element) { + var observer = new IntersectionObserver((entries) => { entries.forEach(entry => { - 
callback(entry.intersectionRatio > 0); + if (entry.intersectionRatio > 0) { + element.focus(); + } }); - }, options); + }, { root: document.documentElement }); observer.observe(element); } -function handleChatInputVisibilityChange(isVisible) { - if (isVisible) { - chatTextArea.focus(); - } -} - -respondToChatInputVisibility(chatTextArea, handleChatInputVisibilityChange); +focusOnVisible(chatTextArea); //------------------------------------------------ // Show enlarged character picture when the profile @@ -463,8 +442,7 @@ let bigPictureVisible = false; function addBigPicture() { var imgElement = document.createElement("img"); - var timestamp = new Date().getTime(); - imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp; + imgElement.src = getProfilePictureUrl(); imgElement.classList.add("bigProfilePicture"); imgElement.addEventListener("load", function () { this.style.visibility = "visible"; @@ -478,9 +456,8 @@ function addBigPicture() { } function deleteBigPicture() { - var bigProfilePictures = document.querySelectorAll(".bigProfilePicture"); - bigProfilePictures.forEach(function (element) { - element.parentNode.removeChild(element); + document.querySelectorAll(".bigProfilePicture").forEach(function (element) { + element.remove(); }); } @@ -494,44 +471,11 @@ function toggleBigPicture() { } } -//------------------------------------------------ -// Handle the chat input box growth -//------------------------------------------------ - -// Cache DOM elements -const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode; -const chatInput = document.querySelector("#chat-input textarea"); - -// Variables to store current dimensions -let currentChatInputHeight = chatInput.clientHeight; - //------------------------------------------------ // Focus on the rename text area when it becomes visible //------------------------------------------------ const renameTextArea = 
document.getElementById("rename-row").querySelector("textarea"); - -function respondToRenameVisibility(element, callback) { - var options = { - root: document.documentElement, - }; - - var observer = new IntersectionObserver((entries, observer) => { - entries.forEach(entry => { - callback(entry.intersectionRatio > 0); - }); - }, options); - - observer.observe(element); -} - - -function handleVisibilityChange(isVisible) { - if (isVisible) { - renameTextArea.focus(); - } -} - -respondToRenameVisibility(renameTextArea, handleVisibilityChange); +focusOnVisible(renameTextArea); //------------------------------------------------ // Adjust the chat tab margin if no extension UI @@ -737,21 +681,21 @@ function handleIndividualSidebarClose(event) { // Close navigation bar if click is outside and it is open if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) { - toggleSidebar(headerBar, navigationToggle, true); + toggleSidebar(headerBar, navigationToggle); } // Close past chats row if click is outside and it is open if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(pastChatsRow, pastChatsToggle, true); + toggleSidebar(pastChatsRow, pastChatsToggle); } // Close chat controls row if click is outside and it is open if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(chatControlsRow, chatControlsToggle, true); + toggleSidebar(chatControlsRow, chatControlsToggle); } } -function toggleSidebar(sidebar, toggle, forceClose = false) { +function toggleSidebar(sidebar, toggle) { const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden"); const shouldClose = !isCurrentlyHidden; @@ -776,11 +720,6 @@ function toggleSidebar(sidebar, toggle, forceClose = false) { toggle.classList.toggle("chat-controls-open", !shouldClose); toggle.innerHTML = shouldClose ? 
leftArrowSVG : rightArrowSVG; } - - // Mobile handling - if (isMobile()) { - sidebar.classList.toggle("sidebar-shown", !shouldClose); - } } // Function to check if the device is mobile @@ -840,17 +779,17 @@ pastChatsToggle.addEventListener("click", () => { const isCurrentlyOpen = !pastChatsRow.classList.contains("sidebar-hidden"); toggleSidebar(pastChatsRow, pastChatsToggle); - // On desktop, open/close both sidebars at the same time + // On desktop, sync both sidebars together if (!isMobile()) { if (isCurrentlyOpen) { // If we just closed the left sidebar, also close the right sidebar if (!chatControlsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(chatControlsRow, chatControlsToggle, true); + toggleSidebar(chatControlsRow, chatControlsToggle); } } else { // If we just opened the left sidebar, also open the right sidebar if (chatControlsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(chatControlsRow, chatControlsToggle, false); + toggleSidebar(chatControlsRow, chatControlsToggle); } } } @@ -860,17 +799,17 @@ chatControlsToggle.addEventListener("click", () => { const isCurrentlyOpen = !chatControlsRow.classList.contains("sidebar-hidden"); toggleSidebar(chatControlsRow, chatControlsToggle); - // On desktop, open/close both sidebars at the same time + // On desktop, sync both sidebars together if (!isMobile()) { if (isCurrentlyOpen) { // If we just closed the right sidebar, also close the left sidebar if (!pastChatsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(pastChatsRow, pastChatsToggle, true); + toggleSidebar(pastChatsRow, pastChatsToggle); } } else { // If we just opened the right sidebar, also open the left sidebar if (pastChatsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(pastChatsRow, pastChatsToggle, false); + toggleSidebar(pastChatsRow, pastChatsToggle); } } } @@ -890,7 +829,7 @@ if (isMobile()) { const textarea = document.querySelector("#chat-input textarea"); if (textarea) { - // Simulate adding and 
removing a newline + // Force textarea height recalculation by simulating content change textarea.value += "\n"; textarea.dispatchEvent(new Event("input", { bubbles: true })); textarea.value = textarea.value.slice(0, -1); diff --git a/js/save_files.js b/js/save_files.js index bdb0e334..c3cbf9ff 100644 --- a/js/save_files.js +++ b/js/save_files.js @@ -1,10 +1,9 @@ // Functions for downloading JSON files function getCurrentTimestamp() { const now = new Date(); - const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds + const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert minutes to milliseconds const localTime = new Date(now.getTime() - timezoneOffset); - const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, "").slice(0, 15); - return formattedTimestamp; + return localTime.toISOString().replace(/[-:]/g, "").slice(0, 15); } function saveFile(contents, filename) { @@ -18,23 +17,18 @@ function saveFile(contents, filename) { } function saveHistory(history, character, mode) { - let path = null; + let path; if (["chat", "chat-instruct"].includes(mode) && character && character.trim() !== "") { path = `history_${character}_${getCurrentTimestamp()}.json`; } else { - try { - path = `history_${mode}_${getCurrentTimestamp()}.json`; - } catch (error) { - path = `history_${getCurrentTimestamp()}.json`; - } + path = `history_${mode || "unknown"}_${getCurrentTimestamp()}.json`; } + saveFile(history, path); } function saveSession(session) { - let path = null; - - path = `session_${getCurrentTimestamp()}.json`; + const path = `session_${getCurrentTimestamp()}.json`; saveFile(session, path); } diff --git a/js/show_controls.js b/js/show_controls.js index ff513395..d5642dc4 100644 --- a/js/show_controls.js +++ b/js/show_controls.js @@ -1,13 +1,11 @@ -const chatParent = document.querySelector(".chat-parent"); - function toggle_controls(value) { + const navToggle = document.getElementById("navigation-toggle"); + const pastChatsToggle 
= document.getElementById("past-chats-toggle"); const extensions = document.querySelector("#extensions"); + const galleryExtension = document.getElementById("gallery-extension"); if (value) { // SHOW MODE: Click toggles to show hidden sidebars - const navToggle = document.getElementById("navigation-toggle"); - const pastChatsToggle = document.getElementById("past-chats-toggle"); - if (navToggle && document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) { navToggle.click(); } @@ -19,17 +17,11 @@ function toggle_controls(value) { if (extensions) { extensions.style.display = "inherit"; } - - let gallery_element = document.getElementById("gallery-extension"); - if (gallery_element) { - gallery_element.style.display = "block"; + if (galleryExtension) { + galleryExtension.style.display = "block"; } - } else { // HIDE MODE: Click toggles to hide visible sidebars - const navToggle = document.getElementById("navigation-toggle"); - const pastChatsToggle = document.getElementById("past-chats-toggle"); - if (navToggle && !document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) { navToggle.click(); } @@ -41,5 +33,8 @@ function toggle_controls(value) { if (extensions) { extensions.style.display = "none"; } + if (galleryExtension) { + galleryExtension.style.display = "none"; + } } } diff --git a/js/switch_tabs.js b/js/switch_tabs.js index 36e5736b..a1b44ef3 100644 --- a/js/switch_tabs.js +++ b/js/switch_tabs.js @@ -2,17 +2,9 @@ function scrollToTop() { window.scrollTo({ top: 0 }); } -function findButtonsByText(buttonText) { - const buttons = document.getElementsByTagName("button"); - const matchingButtons = []; - - for (let i = 0; i < buttons.length; i++) { - if (buttons[i].textContent.trim() === buttonText) { - matchingButtons.push(buttons[i]); - } - } - - return matchingButtons; +function findButtonsByText(buttonText, container = document) { + return Array.from(container.getElementsByTagName("button")) + .filter(btn => 
btn.textContent.trim() === buttonText); } function switch_to_chat() { @@ -39,13 +31,9 @@ function switch_to_character() { function switch_to_image_ai_generate() { const container = document.querySelector("#image-ai-tab"); - const buttons = container.getElementsByTagName("button"); - - for (let i = 0; i < buttons.length; i++) { - if (buttons[i].textContent.trim() === "Generate") { - buttons[i].click(); - break; - } + const generateBtn = findButtonsByText("Generate", container)[0]; + if (generateBtn) { + generateBtn.click(); } scrollToTop(); diff --git a/js/update_big_picture.js b/js/update_big_picture.js index ec51d63b..8f638c99 100644 --- a/js/update_big_picture.js +++ b/js/update_big_picture.js @@ -1,7 +1,6 @@ function updateBigPicture() { var existingElement = document.querySelector(".bigProfilePicture"); if (existingElement) { - var timestamp = new Date().getTime(); - existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp; + existingElement.src = getProfilePictureUrl(); } } diff --git a/modules/extensions.py b/modules/extensions.py index 09db9f40..afe847f0 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -191,21 +191,19 @@ def _apply_custom_generate_reply(): def _apply_custom_css(): - all_css = '' - for extension, _ in iterator(): - if hasattr(extension, 'custom_css'): - all_css += getattr(extension, 'custom_css')() - - return all_css + return ''.join( + getattr(extension, 'custom_css')() + for extension, _ in iterator() + if hasattr(extension, 'custom_css') + ) def _apply_custom_js(): - all_js = '' - for extension, _ in iterator(): - if hasattr(extension, 'custom_js'): - all_js += getattr(extension, 'custom_js')() - - return all_js + return ''.join( + getattr(extension, 'custom_js')() + for extension, _ in iterator() + if hasattr(extension, 'custom_js') + ) def create_extensions_block(): From 71c1a52afe54ab599ab5849ae80f1d5a3a72fb5a Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Mon, 30 Mar 2026 20:49:38 -0700 Subject: [PATCH 12/27] API: Implement echo + logprobs for /v1/completions endpoint --- modules/api/completions.py | 299 ++++++++++++++++++++++++++++++------ modules/exllamav3.py | 26 +++- modules/llama_cpp_server.py | 39 ++++- 3 files changed, 309 insertions(+), 55 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 8948bb86..587ad6ea 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -39,6 +39,129 @@ def load_chat_template_file(filepath): return text +def _first_token_display_str(token_id, prompt, tokenizer): + """Return the display string for the first prompt token. + + Returns empty string for BOS or tokens that don't appear at the start + of the prompt text, so they don't shift text_offset for subsequent tokens. + """ + token_id = int(token_id) + bos_id = getattr(tokenizer, 'bos_token_id', None) + if bos_id is not None and token_id == bos_id: + return "" + + import torch + tok = tokenizer.decode(torch.tensor([token_id])) + if not prompt.startswith(tok): + return "" + + return tok + + +def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): + """Compute logprob entries for prompt tokens via a forward pass. + + Returns a list of logprob entries in the standard format. + The first token gets a null entry (no conditioning context). + + Supported for HF-compatible loaders (Transformers, ExLlamav3_HF, etc.) + via a single forward pass, and for llama.cpp via the server's + prompt_logprobs parameter. Returns [] for unsupported loaders. 
+ """ + if input_ids is None: + input_ids = encode(prompt) # (1, seq_len) tensor or array + + token_ids = input_ids[0] + n_tokens = len(token_ids) + + if n_tokens == 0: + return [] + + loader = shared.args.loader + model = shared.model + + if loader == 'llama.cpp': + return model.get_prompt_logprob_entries(token_ids, max(logprobs_count, 1), prompt=prompt) + + first_token_str = _first_token_display_str(token_ids[0], prompt, shared.tokenizer) + + if n_tokens <= 1: + return [{"token": first_token_str, "null_logprob": True}] + + import torch + + if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'): + # Native ExLlamav3: call the underlying Model.forward() directly + input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) + with torch.no_grad(): + logits = model.model.forward( + input_ids=input_ids_tensor, + params={ + "attn_mode": "flash_attn", + "cache": model.cache, + "past_len": 0, + "batch_shape": (1, model.max_tokens), + } + ).float().cpu() + + elif hasattr(model, 'forward'): + # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.) + input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) + if hasattr(model, 'device'): + input_ids_tensor = input_ids_tensor.to(model.device) + with torch.no_grad(): + # Pass labels to ensure logits are returned for ALL positions, + # not just the last token (some HF wrappers like ExLlamav3_HF + # only compute the last-token logits when labels are absent). + outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor) + logits = outputs.logits.float().cpu() + + else: + return [] + + entries = [{"token": first_token_str, "null_logprob": True}] + + # Batch logsumexp and topk as single operations across all positions + # to avoid per-position kernel launch overhead. 
+ prompt_logits = logits[0, :n_tokens - 1] # positions 0..n-2 predict tokens 1..n-1 + k = min(logprobs_count, prompt_logits.shape[-1]) + all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1) + all_lse = torch.logsumexp(prompt_logits, dim=-1) + all_top_log_probs = all_top_values - all_lse.unsqueeze(-1) + + # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls + unique_ids = set(int(tid) for tid in token_ids[1:]) + unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist()) + + decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids} + + for i in range(1, n_tokens): + token_id = int(token_ids[i]) + idx = i - 1 + top_log_probs = all_top_log_probs[idx] + top_ids = all_top_indices[idx].tolist() + actual_token_str = decoded_strs[token_id] + + # Build the top list with the actual prompt token guaranteed at front + if token_id in top_ids: + actual_lp = top_log_probs[top_ids.index(token_id)].item() + alternatives = [ + {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()} + for j in range(k) if top_ids[j] != token_id + ] + else: + actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item() + alternatives = [ + {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()} + for j in range(k - 1) # drop lowest to make room + ] + + entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives} + entries.append(entry) + + return entries + + def _get_raw_logprob_entries(offset=0): """Get raw logprob entries from llama.cpp/ExLlamav3 backend, starting from offset. @@ -65,6 +188,21 @@ def _parse_entry_top(entry): return entry.get('top_logprobs', entry.get('top_probs', [])) +def _extract_sampled_token(entry, top): + """Get the actually sampled token and its logprob from a logprob entry. 
+ + Uses the entry-level token/logprob when available (the actually sampled + token), falling back to top[0] (highest-probability alternative) which + may differ with non-greedy sampling. + """ + if 'token' in entry: + return entry['token'], entry.get('logprob', entry.get('prob', 0)) + + token_str = top[0].get('token', '') + token_logprob = top[0].get('logprob', top[0].get('prob', 0)) + return token_str, token_logprob + + def format_chat_logprobs(entries): """Format logprob entries into OpenAI chat completions logprobs format. @@ -79,9 +217,7 @@ def format_chat_logprobs(entries): if not top: continue - chosen = top[0] - token_str = chosen.get('token', '') - token_logprob = chosen.get('logprob', chosen.get('prob', 0)) + token_str, token_logprob = _extract_sampled_token(entry, top) top_list = [] for item in top: @@ -118,13 +254,21 @@ def format_completion_logprobs(entries): offset = 0 for entry in entries: + # Handle null logprob entries (first prompt token with echo) + if entry.get("null_logprob"): + token_str = entry.get("token", "") + tokens.append(token_str) + token_logprobs.append(None) + top_logprobs.append(None) + text_offset.append(offset) + offset += len(token_str) + continue + top = _parse_entry_top(entry) if not top: continue - chosen = top[0] - token_str = chosen.get('token', '') - token_logprob = chosen.get('logprob', chosen.get('prob', 0)) + token_str, token_logprob = _extract_sampled_token(entry, top) tokens.append(token_str) token_logprobs.append(token_logprob) @@ -407,7 +551,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p }) max_tokens = generate_params['max_new_tokens'] - if max_tokens in [None, 0]: + if max_tokens is not None and max_tokens <= 0: + raise InvalidRequestError(message="max_tokens must be greater than 0.", param="max_tokens") + + if max_tokens is None: generate_params['max_new_tokens'] = 512 generate_params['auto_max_new_tokens'] = True @@ -652,6 +799,15 @@ def completions_common(body: dict, 
is_legacy: bool = False, stream=False, stop_e # common params generate_params = process_parameters(body, is_legacy=is_legacy) max_tokens = generate_params['max_new_tokens'] + if max_tokens is None: + generate_params['max_new_tokens'] = 512 + generate_params['auto_max_new_tokens'] = True + max_tokens = 512 + elif max_tokens < 0: + raise InvalidRequestError(message="max_tokens must be greater than or equal to 0.", param="max_tokens") + elif max_tokens == 0 and body.get('logprobs') is None: + raise InvalidRequestError(message="max_tokens is 0 but no logprobs parameter was specified.", param="max_tokens") + generate_params['stream'] = stream if stop_event is not None: generate_params['stop_event'] = stop_event @@ -700,9 +856,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e prompt = decode(prompt)[0] prefix = prompt if echo else '' - token_count = len(encode(prompt)[0]) + prompt_input_ids = encode(prompt) + token_count = len(prompt_input_ids[0]) total_prompt_token_count += token_count + # Compute prompt logprobs once per prompt (shared across n_completions) + logprobs_val = body.get('logprobs', None) + if echo and logprobs_val is not None and logprobs_val >= 0: + prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids) + else: + prompt_entries = None + original_seed = generate_params.get('seed', -1) for _n in range(n_completions): # Increment seed for each completion to ensure diversity (matches llama.cpp native behavior) @@ -713,29 +877,41 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e logprob_proc.token_alternatives_history.clear() # generate reply ####################################### - debug_msg({'prompt': prompt, 'generate_params': generate_params}) - generator = generate_reply(prompt, generate_params, is_chat=False) - answer = '' - - for a in generator: - answer = a - - completion_token_count = len(encode(answer)[0]) - total_completion_token_count 
+= completion_token_count - stop_reason = "stop" - if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens: - stop_reason = "length" - - if logprob_proc: - all_entries = [] - for alt in logprob_proc.token_alternatives_history: - all_entries.extend(_dict_to_logprob_entries(alt)) - completion_logprobs = format_completion_logprobs(all_entries) - elif shared.args.loader in ('llama.cpp', 'ExLlamav3'): - raw = getattr(shared.model, 'last_completion_probabilities', None) - completion_logprobs = format_completion_logprobs(raw) + if max_tokens == 0: + answer = '' + completion_token_count = 0 + stop_reason = "stop" else: - completion_logprobs = None + debug_msg({'prompt': prompt, 'generate_params': generate_params}) + generator = generate_reply(prompt, generate_params, is_chat=False) + answer = '' + + for a in generator: + answer = a + + completion_token_count = len(encode(answer)[0]) + stop_reason = "stop" + if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens: + stop_reason = "length" + + total_completion_token_count += completion_token_count + + if max_tokens == 0: + all_entries = [] + else: + if logprob_proc: + all_entries = [] + for alt in logprob_proc.token_alternatives_history: + all_entries.extend(_dict_to_logprob_entries(alt)) + elif shared.args.loader in ('llama.cpp', 'ExLlamav3'): + all_entries = getattr(shared.model, 'last_completion_probabilities', None) or [] + else: + all_entries = [] + + if prompt_entries: + all_entries = prompt_entries + all_entries + + completion_logprobs = format_completion_logprobs(all_entries) if all_entries else None respi = { "index": choice_index, @@ -775,7 +951,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e raise InvalidRequestError(message="API Batched generation not yet supported.", param=prompt_str) prefix = prompt if echo else '' - token_count = 
len(encode(prompt)[0]) + prompt_input_ids = encode(prompt) + token_count = len(prompt_input_ids[0]) # Check if usage should be included in streaming chunks per OpenAI spec stream_options = body.get('stream_options') @@ -808,37 +985,57 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e return chunk + logprobs_val = body.get('logprobs', None) + if echo and logprobs_val is not None and logprobs_val >= 0: + prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids) + prompt_logprobs_formatted = format_completion_logprobs(prompt_entries) if prompt_entries else None + else: + prompt_logprobs_formatted = None + + # Clear stale logprobs from any previous request before building the + # first chunk, so text_streaming_chunk doesn't pick up old data. + if hasattr(shared.model, 'last_completion_probabilities'): + shared.model.last_completion_probabilities = [] + cmpl_logprobs_offset[0] = 0 + chunk = text_streaming_chunk(prefix) + if prompt_logprobs_formatted is not None: + chunk[resp_list][0]["logprobs"] = prompt_logprobs_formatted if include_usage: chunk['usage'] = None yield chunk # generate reply ####################################### - debug_msg({'prompt': prompt, 'generate_params': generate_params}) - generator = generate_reply(prompt, generate_params, is_chat=False) - answer = '' - seen_content = '' - completion_token_count = 0 + if max_tokens == 0: + answer = '' + completion_token_count = 0 + stop_reason = "stop" + else: + debug_msg({'prompt': prompt, 'generate_params': generate_params}) + generator = generate_reply(prompt, generate_params, is_chat=False) + answer = '' + seen_content = '' + completion_token_count = 0 - for a in generator: - answer = a + for a in generator: + answer = a - len_seen = len(seen_content) - new_content = answer[len_seen:] + len_seen = len(seen_content) + new_content = answer[len_seen:] - if not new_content or chr(0xfffd) in new_content: # partial unicode character, 
don't send it yet. - continue + if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. + continue - seen_content = answer - chunk = text_streaming_chunk(new_content) - if include_usage: - chunk['usage'] = None - yield chunk + seen_content = answer + chunk = text_streaming_chunk(new_content) + if include_usage: + chunk['usage'] = None + yield chunk - completion_token_count = len(encode(answer)[0]) - stop_reason = "stop" - if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens: - stop_reason = "length" + completion_token_count = len(encode(answer)[0]) + stop_reason = "stop" + if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens: + stop_reason = "length" chunk = text_streaming_chunk(suffix) chunk[resp_list][0]["finish_reason"] = stop_reason diff --git a/modules/exllamav3.py b/modules/exllamav3.py index f873503a..3782a693 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -489,15 +489,35 @@ class Exllamav3Model: return id_to_piece = self.tokenizer.get_id_to_piece_list(True) + sampled_ids = result.get("token_ids") # (batch, seq_len) - actually sampled tokens + sampled_probs = result.get("token_probs") # (batch, seq_len) - their probabilities + + def _piece(tid): + s = id_to_piece[tid] if tid < len(id_to_piece) else f"<{tid}>" + return s.replace('\u2581', ' ') + + def _logprob(prob): + return math.log(prob) if prob > 0 else float("-inf") + # top_k_tokens shape: (batch, seq_len, k), top_k_probs same for seq_idx in range(top_k_tokens.shape[1]): entry = {"top_logprobs": []} for k_idx in range(top_k_tokens.shape[2]): token_id = top_k_tokens[0, seq_idx, k_idx].item() prob = top_k_probs[0, seq_idx, k_idx].item() - token_str = id_to_piece[token_id] if token_id < len(id_to_piece) else f"<{token_id}>" - logprob = math.log(prob) if prob > 0 else float("-inf") - 
entry["top_logprobs"].append({"token": token_str, "logprob": logprob}) + entry["top_logprobs"].append({"token": _piece(token_id), "logprob": _logprob(prob)}) + + # Record the actually sampled token at the entry level so + # format_completion_logprobs uses it instead of top_logprobs[0] + # (they differ with non-greedy sampling). + if sampled_ids is not None: + sid = sampled_ids[0, seq_idx].item() + entry["token"] = _piece(sid) + if sampled_probs is not None: + entry["logprob"] = _logprob(sampled_probs[0, seq_idx].item()) + else: + entry["logprob"] = None + self.last_completion_probabilities.append(entry) def generate(self, prompt, state): diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index fa968be1..34080466 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -310,8 +310,45 @@ class LlamaServer: else: raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") + def get_prompt_logprob_entries(self, token_ids, n_probs=5, prompt=""): + """Get logprob entries for prompt tokens via a single n_predict=0 request. + + Requires llama.cpp server with prompt_logprobs support. + Returns entries in the standard format for format_completion_logprobs(). + """ + token_ids_list = token_ids.tolist() if hasattr(token_ids, 'tolist') else list(token_ids) + + url = f"http://127.0.0.1:{self.port}/completion" + payload = { + "prompt": token_ids_list, + "n_predict": 0, + "n_probs": n_probs, + "prompt_logprobs": True, + "stream": False, + "cache_prompt": False, + } + + response = self.session.post(url, json=payload) + result = response.json() + + prompt_probs = result.get("prompt_probabilities", []) + if not prompt_probs: + return [] + + # Null first token (no conditioning context); use empty string for BOS + # or tokens that don't appear at the start of the prompt text. 
+ first_token_str = self.decode([token_ids_list[0]]) + if self.bos_token and first_token_str == self.bos_token: + first_token_str = "" + elif not prompt.startswith(first_token_str): + first_token_str = "" + + entries = [{"token": first_token_str, "null_logprob": True}] + entries.extend(prompt_probs) + return entries + def _get_vocabulary_size(self): - """Get and store the model's maximum context length.""" + """Get and store the model's vocabulary size.""" url = f"http://127.0.0.1:{self.port}/v1/models" response = self.session.get(url).json() From 328534b762f22c82b09babf6b04e289eab4a7fde Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:51:07 -0700 Subject: [PATCH 13/27] Update llama.cpp --- requirements/full/requirements.txt | 8 ++++---- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 4 ++-- requirements/full/requirements_cpu_only.txt | 8 ++++---- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 2 +- requirements/portable/requirements_apple_silicon.txt | 2 +- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cuda131.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- 12 files changed, 26 insertions(+), 26 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 6e11dd2f..57991c9a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -40,10 +40,10 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 
c964eff6..bb47ea4b 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index b1dd6a4f..5750b109 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 
4d03d280..d8302d3d 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 9d41d069..d3a5c008 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -37,7 +37,7 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; 
platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index ff80b6c8..1180b42d 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 318044da..57aa6262 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; 
platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 1676bffb..894c9199 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 27fc2da8..32b9727f 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_cpu_only.txt 
b/requirements/portable/requirements_cpu_only.txt index 0bbdd30a..73b72832 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index c3ae3c57..ad96bbe2 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git 
a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index e646c04c..a5df3ad4 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # Vulkan wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 4073164be0b305d8ac4a01d4259448370d009a99 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Apr 2026 19:08:37 -0700 Subject: [PATCH 14/27] Fix ExLlamav3 OOM on prompt logprobs and qwen3_5_moe HF compat --- modules/api/completions.py | 13 +++++-------- modules/exllamav3.py | 33 ++++----------------------------- modules/exllamav3_hf.py | 32 ++++++++------------------------ 3 files changed, 17 insertions(+), 61 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 587ad6ea..a15e1f86 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -91,17 +91,14 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): import torch if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'): - # Native ExLlamav3: call the underlying Model.forward() directly + # Native ExLlamav3: call the underlying Model.forward() in chunks + # to avoid 
OOM from giant logits tensors (seq_len * vocab_size * 4 bytes) input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) + input_ids_tensor = input_ids_tensor.view(-1).cpu() with torch.no_grad(): logits = model.model.forward( - input_ids=input_ids_tensor, - params={ - "attn_mode": "flash_attn", - "cache": model.cache, - "past_len": 0, - "batch_shape": (1, model.max_tokens), - } + input_ids=input_ids_tensor.view(1, -1), + params={"attn_mode": "flash_attn_nc"} ).float().cpu() elif hasattr(model, 'forward'): diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 3782a693..7556a908 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -530,39 +530,14 @@ class Exllamav3Model: def get_logits(self, token_ids, **kwargs): """ Process a batch of token_ids and return the logits for the last token. - This will reset and overwrite the model's cache. + Uses flash_attn_nc (no cache) for correct results with recurrent models. """ - # Initialize a single params dictionary that will be updated in-place - params = { - "cache": self.cache, - "reconstruct": False, - "attn_mode": "flash_attn", - "batch_shape": (1, self.max_tokens), - "past_len": 0 - } - params.update(kwargs) - - # Process prefix tokens to fill the cache and generate recurrent state - if token_ids.shape[-1] > 1: - prefix_ids = token_ids[:, :-1] - - # This forward call updates the 'params' dict with the recurrent state - self.model.forward( - input_ids=prefix_ids, - params=params - ) - - # Update past_len for the next call - params["past_len"] = prefix_ids.shape[-1] - - # Process the last token, now using the state-filled 'params' dict - last_token_ids = token_ids[:, -1:] logits = self.model.forward( - input_ids=last_token_ids, - params=params + input_ids=token_ids, + params={"attn_mode": "flash_attn_nc"} ) - return logits.float().cpu() + return logits[:, -1:, :].float().cpu() def encode(self, string, **kwargs): add_bos = kwargs.pop('add_bos', 
True) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index e0ad5002..5e634e22 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -26,6 +26,9 @@ except Exception: class Exllamav3HF(PreTrainedModel, GenerationMixin): def __init__(self, model_dir): hf_config = PretrainedConfig.from_pretrained(model_dir) + # Ensure text_config is a proper object, not a dict (fixes qwen3_5_moe + transformers compat) + if isinstance(getattr(hf_config, 'text_config', None), dict): + hf_config.text_config = PretrainedConfig(**hf_config.text_config) super().__init__(hf_config) exl3_config = Config.from_directory(model_dir) @@ -199,30 +202,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): } ).to(input_ids.device).float() else: - # Labels path: use cache for cross-chunk attention. - tokens_to_process = seq_tensor - all_logits = None - current_len = 0 - - for i in range(0, tokens_to_process.shape[0], max_chunk_size): - chunk = tokens_to_process[i:i + max_chunk_size] - chunk_logits = self.ex_model.forward( - input_ids=chunk.view(1, -1), - params={ - "attn_mode": "flash_attn", - "cache": ex_cache, - "past_len": current_len, - "batch_shape": (1, self.max_tokens), - } - ).float() - current_len += chunk.shape[0] - - if all_logits is None: - all_logits = chunk_logits - else: - all_logits = torch.cat([all_logits, chunk_logits], dim=1) - - logits = all_logits + # Labels path: single pass without cache for correct logits + logits = self.ex_model.forward( + input_ids=seq_tensor.view(1, -1), + params={"attn_mode": "flash_attn_nc"} + ).float().cpu() if is_negative: self.past_seq_negative = seq_tensor From a32ce254f275efe473d6624995957b3b6bd51aa1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Apr 2026 20:28:44 -0700 Subject: [PATCH 15/27] Don't pass torch_dtype to transformers, autodetect from model config --- modules/transformers_loader.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff 
--git a/modules/transformers_loader.py b/modules/transformers_loader.py index 7f521b8c..5964f012 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -109,7 +109,6 @@ def load_model_HF(model_name): params = { 'low_cpu_mem_usage': True, 'attn_implementation': shared.args.attn_implementation, - 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, } if shared.original_args.trust_remote_code: @@ -120,6 +119,17 @@ def load_model_HF(model_name): config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code) + # Determine torch_dtype: respect --bf16 flag, otherwise autodetect + # from model config, but never allow float32. + if shared.args.bf16: + params['torch_dtype'] = torch.bfloat16 + else: + dtype = getattr(config, 'torch_dtype', None) or getattr(getattr(config, 'text_config', None), 'torch_dtype', None) + if dtype in (torch.float16, torch.bfloat16): + params['torch_dtype'] = dtype + else: + params['torch_dtype'] = torch.float16 + if 'chatglm' in model_name.lower(): LoaderClass = AutoModel else: From c10c6e87ae0b0085b36e7e13269461744ce04ff6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 07:17:27 -0700 Subject: [PATCH 16/27] API: Add token ids to logprobs output --- modules/api/completions.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index a15e1f86..453fa07b 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -143,17 +143,17 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): if token_id in top_ids: actual_lp = top_log_probs[top_ids.index(token_id)].item() alternatives = [ - {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()} + {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()} for j in range(k) if 
top_ids[j] != token_id ] else: actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item() alternatives = [ - {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()} + {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()} for j in range(k - 1) # drop lowest to make room ] - entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives} + entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives} entries.append(entry) return entries @@ -239,7 +239,7 @@ def format_chat_logprobs(entries): def format_completion_logprobs(entries): """Format logprob entries into OpenAI completions logprobs format. - Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "text_offset"} + Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "top_logprobs_ids": [{token_id: prob}], "text_offset"} """ if not entries: return None @@ -247,6 +247,7 @@ def format_completion_logprobs(entries): tokens = [] token_logprobs = [] top_logprobs = [] + top_logprobs_ids = [] text_offset = [] offset = 0 @@ -257,6 +258,7 @@ def format_completion_logprobs(entries): tokens.append(token_str) token_logprobs.append(None) top_logprobs.append(None) + top_logprobs_ids.append(None) text_offset.append(offset) offset += len(token_str) continue @@ -273,21 +275,28 @@ def format_completion_logprobs(entries): offset += len(token_str) top_dict = {} + top_dict_ids = {} for item in top: t = item.get('token', '') lp = item.get('logprob', item.get('prob', 0)) top_dict[t] = lp + if 'token_id' in item: + top_dict_ids[item['token_id']] = lp top_logprobs.append(top_dict) + top_logprobs_ids.append(top_dict_ids if top_dict_ids else None) if not tokens: return None - return { + result = { "tokens": tokens, "token_logprobs": token_logprobs, "top_logprobs": top_logprobs, "text_offset": text_offset } + if any(x is not None for x in top_logprobs_ids): + 
result["top_logprobs_ids"] = top_logprobs_ids + return result def process_parameters(body, is_legacy=False): From ea1f8c71f2e92dc9ae230b943c605e43ff5c633c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:30:59 -0300 Subject: [PATCH 17/27] API: Optimize prompt logprobs and refactor ExLlamav3 forward pass --- modules/api/completions.py | 69 ++++++++++++++++++++++++-------------- modules/exllamav3.py | 14 ++++++++ 2 files changed, 58 insertions(+), 25 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 453fa07b..4eb8fdad 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -90,16 +90,8 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): import torch - if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'): - # Native ExLlamav3: call the underlying Model.forward() in chunks - # to avoid OOM from giant logits tensors (seq_len * vocab_size * 4 bytes) - input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) - input_ids_tensor = input_ids_tensor.view(-1).cpu() - with torch.no_grad(): - logits = model.model.forward( - input_ids=input_ids_tensor.view(1, -1), - params={"attn_mode": "flash_attn_nc"} - ).float().cpu() + if hasattr(model, 'get_prompt_logits'): + logits = model.get_prompt_logits(input_ids) elif hasattr(model, 'forward'): # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.) @@ -111,26 +103,54 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): # not just the last token (some HF wrappers like ExLlamav3_HF # only compute the last-token logits when labels are absent). 
outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor) - logits = outputs.logits.float().cpu() + logits = outputs.logits # keep on GPU, (1, seq_len, vocab) in model dtype + del outputs else: return [] entries = [{"token": first_token_str, "null_logprob": True}] - # Batch logsumexp and topk as single operations across all positions - # to avoid per-position kernel launch overhead. - prompt_logits = logits[0, :n_tokens - 1] # positions 0..n-2 predict tokens 1..n-1 - k = min(logprobs_count, prompt_logits.shape[-1]) - all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1) - all_lse = torch.logsumexp(prompt_logits, dim=-1) - all_top_log_probs = all_top_values - all_lse.unsqueeze(-1) - - # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls + logprobs_count = max(logprobs_count, 1) + k = min(logprobs_count, logits.shape[-1]) + chunk_size = 2048 unique_ids = set(int(tid) for tid in token_ids[1:]) - unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist()) - decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids} + # Process logits in chunks on GPU, only move top-K results to CPU + all_top_log_probs_list = [] + all_top_indices_list = [] + all_actual_lps = [] + + for start in range(0, n_tokens - 1, chunk_size): + end = min(start + chunk_size, n_tokens - 1) + chunk_logits = logits[0, start:end].float() # (chunk, vocab) on GPU + chunk_lse = torch.logsumexp(chunk_logits, dim=-1) + chunk_top_values, chunk_top_indices = torch.topk(chunk_logits, k=k, dim=-1) + chunk_top_log_probs = chunk_top_values - chunk_lse.unsqueeze(-1) + + # Compute logprob for actual next tokens in this chunk + chunk_top_sets = [set(chunk_top_indices[j].tolist()) for j in range(end - start)] + for j in range(end - start): + actual_tid = int(token_ids[start + j + 1]) + if actual_tid not in chunk_top_sets[j]: + all_actual_lps.append((chunk_logits[j, actual_tid] - chunk_lse[j]).item()) + else: + 
all_actual_lps.append(None) # will use top_log_probs + + all_top_log_probs_list.append(chunk_top_log_probs.cpu()) + all_top_indices_list.append(chunk_top_indices.cpu()) + unique_ids.update(int(tid) for tid in chunk_top_indices.flatten().tolist()) + del chunk_logits, chunk_lse, chunk_top_values + + del logits + torch.cuda.empty_cache() + + all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0) + all_top_indices = torch.cat(all_top_indices_list, dim=0) + + unique_ids_list = sorted(unique_ids) + decoded_list = shared.tokenizer.batch_decode([[tid] for tid in unique_ids_list]) if hasattr(shared.tokenizer, 'batch_decode') else [shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids_list] + decoded_strs = dict(zip(unique_ids_list, decoded_list)) for i in range(1, n_tokens): token_id = int(token_ids[i]) @@ -139,7 +159,6 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): top_ids = all_top_indices[idx].tolist() actual_token_str = decoded_strs[token_id] - # Build the top list with the actual prompt token guaranteed at front if token_id in top_ids: actual_lp = top_log_probs[top_ids.index(token_id)].item() alternatives = [ @@ -147,10 +166,10 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): for j in range(k) if top_ids[j] != token_id ] else: - actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item() + actual_lp = all_actual_lps[idx] alternatives = [ {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()} - for j in range(k - 1) # drop lowest to make room + for j in range(k - 1) ] entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives} diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 7556a908..e1efbfeb 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -527,6 +527,20 @@ class Exllamav3Model: return output + def get_prompt_logits(self, input_ids): + """Return logits for 
all positions via a single no-cache forward pass. + + Used by prompt logprobs computation. Returns (1, seq_len, vocab) on CPU in float32. + """ + import torch + input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) + input_ids_tensor = input_ids_tensor.view(1, -1).cpu() + with torch.no_grad(): + return self.model.forward( + input_ids=input_ids_tensor, + params={"attn_mode": "flash_attn_nc"} + ).cpu().float() + def get_logits(self, token_ids, **kwargs): """ Process a batch of token_ids and return the logits for the last token. From c50e17bdbe1da850189188afaf0682a952efa0d1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:49:31 -0300 Subject: [PATCH 18/27] Add dedicated ik portable requirements files and remove macOS ik builds --- .github/workflows/build-everything-tgw.yml | 7 --- .../build-portable-release-ik-cuda.yml | 9 ++-- .../workflows/build-portable-release-ik.yml | 44 +++---------------- requirements/portable/requirements_ik.txt | 27 ++++++++++++ .../portable/requirements_ik_cpu_only.txt | 27 ++++++++++++ .../portable/requirements_ik_cuda131.txt | 27 ++++++++++++ 6 files changed, 91 insertions(+), 50 deletions(-) create mode 100644 requirements/portable/requirements_ik.txt create mode 100644 requirements/portable/requirements_ik_cpu_only.txt create mode 100644 requirements/portable/requirements_ik_cuda131.txt diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml index 4de591f4..40d9db5d 100644 --- a/.github/workflows/build-everything-tgw.yml +++ b/.github/workflows/build-everything-tgw.yml @@ -96,10 +96,3 @@ jobs: with: version: ${{ inputs.version }} config: 'os:ubuntu-22.04' - - build_release_ik_macos: - name: ik macOS - uses: ./.github/workflows/build-portable-release-ik.yml - with: - version: ${{ inputs.version }} - config: 'os:macos-14' diff --git 
a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml index 40b4b92f..331a7653 100644 --- a/.github/workflows/build-portable-release-ik-cuda.yml +++ b/.github/workflows/build-portable-release-ik-cuda.yml @@ -138,14 +138,13 @@ jobs: # 3. Prepare requirements file based on CUDA version cd "text-generation-webui-${VERSION_CLEAN}" if [[ "$CUDA_VERSION" == "13.1" ]]; then - REQ_FILE="requirements/portable/requirements_cuda131.txt" + REQ_FILE="requirements/portable/requirements_ik_cuda131.txt" else - REQ_FILE="requirements/portable/requirements.txt" + REQ_FILE="requirements/portable/requirements_ik.txt" fi - # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts - sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" - sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true + # 4. Inject --ik into start scripts + sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true # 5. Install packages echo "Installing Python packages from $REQ_FILE..." diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml index afb2e763..bf54eb0e 100644 --- a/.github/workflows/build-portable-release-ik.yml +++ b/.github/workflows/build-portable-release-ik.yml @@ -1,4 +1,4 @@ -name: Build ik CPU and macOS +name: Build ik CPU on: workflow_dispatch: @@ -57,7 +57,7 @@ jobs: id: set-matrix run: | $matrix = @{ - 'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14') + 'os' = @('ubuntu-22.04', 'windows-2022') 'pyver' = @("3.13") } @@ -110,7 +110,6 @@ jobs: # Define common variables VERSION="${{ inputs.version }}" - OS_TYPE="${{ matrix.os }}" # 1. 
Set platform-specific variables if [[ "$RUNNER_OS" == "Windows" ]]; then @@ -119,21 +118,7 @@ jobs: PIP_PATH="portable_env/python.exe -m pip" PACKAGES_PATH="portable_env/Lib/site-packages" rm start_linux.sh start_macos.sh - elif [[ "$RUNNER_OS" == "macOS" ]]; then - if [[ "$OS_TYPE" == "macos-15-intel" ]]; then - PLATFORM="macos-x86_64" - PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz" - REQ_TYPE="apple_intel" - else - PLATFORM="macos-arm64" - PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz" - REQ_TYPE="apple_silicon" - fi - PIP_PATH="portable_env/bin/python -m pip" - PACKAGES_PATH="portable_env/lib/python3.13/site-packages" - rm start_linux.sh start_windows.bat else - # Linux case PLATFORM="linux-cpu" PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" PIP_PATH="portable_env/bin/python -m pip" @@ -148,30 +133,13 @@ jobs: tar -xzf python-build.tar.gz mv python "text-generation-webui-${VERSION_CLEAN}/portable_env" - # 3. Prepare requirements file based on platform + # 3. Prepare requirements file cd "text-generation-webui-${VERSION_CLEAN}" - - # Select requirements file based on platform - if [[ "$RUNNER_OS" == "macOS" ]]; then - if [[ "$OS_TYPE" == "macos-15-intel" ]]; then - REQ_FILE="requirements/portable/requirements_apple_intel.txt" - else - REQ_FILE="requirements/portable/requirements_apple_silicon.txt" - fi - else - REQ_FILE="requirements/portable/requirements_cpu_only.txt" - fi - + REQ_FILE="requirements/portable/requirements_ik_cpu_only.txt" echo "Using requirements file: $REQ_FILE" - # 4. 
Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts - if [[ "$RUNNER_OS" == "macOS" ]]; then - sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" - sed -i '' 's/--portable/--portable --ik/g' start_macos.sh - else - sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" - sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true - fi + # 4. Inject --ik into start scripts + sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true # 5. Install packages echo "Installing Python packages from $REQ_FILE..." diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt new file mode 100644 index 00000000..2fa037f7 --- /dev/null +++ b/requirements/portable/requirements_ik.txt @@ -0,0 +1,27 @@ +audioop-lts<1.0; python_version >= "3.13" +fastapi==0.112.4 +huggingface-hub==1.5.* +jinja2==3.1.6 +markdown +numpy==2.2.* +pydantic==2.11.0 +pymupdf==1.27.* +python-docx==1.1.2 +pyyaml +requests +rich +trafilatura==2.0.0 +tqdm + +# Gradio +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl + +# API +flask_cloudflared==0.0.15 +sse-starlette==1.6.5 +tiktoken + +# CUDA wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt new file mode 100644 index 00000000..b43b51c4 --- /dev/null +++ 
b/requirements/portable/requirements_ik_cpu_only.txt @@ -0,0 +1,27 @@ +audioop-lts<1.0; python_version >= "3.13" +fastapi==0.112.4 +huggingface-hub==1.5.* +jinja2==3.1.6 +markdown +numpy==2.2.* +pydantic==2.11.0 +pymupdf==1.27.* +python-docx==1.1.2 +pyyaml +requests +rich +trafilatura==2.0.0 +tqdm + +# Gradio +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl + +# API +flask_cloudflared==0.0.15 +sse-starlette==1.6.5 +tiktoken + +# ik_llama.cpp (CPU only) +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt new file mode 100644 index 00000000..12767285 --- /dev/null +++ b/requirements/portable/requirements_ik_cuda131.txt @@ -0,0 +1,27 @@ +audioop-lts<1.0; python_version >= "3.13" +fastapi==0.112.4 +huggingface-hub==1.5.* +jinja2==3.1.6 +markdown +numpy==2.2.* +pydantic==2.11.0 +pymupdf==1.27.* +python-docx==1.1.2 +pyyaml +requests +rich +trafilatura==2.0.0 +tqdm + +# Gradio +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl + +# API +flask_cloudflared==0.0.15 +sse-starlette==1.6.5 +tiktoken + +# CUDA wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 8f8b57a029715d07ab164aa22a779ea7ea4619f1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:54:20 -0700 Subject: [PATCH 19/27] Update exllamav3 --- requirements/full/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 57991c9a..5591c9ca 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -44,7 +44,7 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" 
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" From 6a1f720c7bb9aef73c1c7c4e311460174c5255ec Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:58:20 -0700 Subject: [PATCH 20/27] Update transformers --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 5591c9ca..30ee0316 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -25,7 +25,7 @@ sentencepiece tensorboard torchao==0.15.* trafilatura==2.0.0 -transformers==5.3.* +transformers==5.5.* triton-windows==3.5.1.post24; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index bb47ea4b..9edc1d95 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb diff --git 
a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 5750b109..ff8687c1 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index d8302d3d..208632e8 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index d3a5c008..4a7e5aaa 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 052085cc..6200589e 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb From 468cb5cb87bf02f96efcd5acb1d1ac4b08c68273 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:59:28 -0700 Subject: [PATCH 21/27] Update accelerate --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- 
requirements/full/requirements_nowheels.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 30ee0316..e5bec6ec 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" bitsandbytes==0.49.* datasets diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 9edc1d95..c6b5b2d0 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index ff8687c1..ce671f0a 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 208632e8..d12d9f80 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 4a7e5aaa..4066b1af 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 
6200589e..7173345a 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* From 80e81a54cacacbd8aa16ccf312ae0e574e4b416c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 11:11:44 -0700 Subject: [PATCH 22/27] Remove ik macOS wheels from full requirements --- requirements/full/requirements_apple_intel.txt | 1 - requirements/full/requirements_apple_silicon.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index ce671f0a..55a313e9 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -38,4 +38,3 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index d12d9f80..a6d34cbb 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -38,4 +38,3 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" From f6f8f14c8d0993327a2c86dfa3c976a7c1c569fc Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 16:13:39 -0300 Subject: [PATCH 23/27] Security: Fix SSRF in superbooga extensions --- extensions/superbooga/download_urls.py | 3 +++ extensions/superboogav2/download_urls.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/extensions/superbooga/download_urls.py b/extensions/superbooga/download_urls.py index 424a9885..b28fea42 100644 --- a/extensions/superbooga/download_urls.py +++ b/extensions/superbooga/download_urls.py @@ -2,8 +2,11 @@ import concurrent.futures import requests +from modules.web_search import _validate_url + def download_single(url): + _validate_url(url) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' } diff --git a/extensions/superboogav2/download_urls.py b/extensions/superboogav2/download_urls.py index 5b5a2e17..4d8b98b1 100644 --- a/extensions/superboogav2/download_urls.py +++ b/extensions/superboogav2/download_urls.py @@ -5,12 +5,14 @@ import requests from bs4 import BeautifulSoup import extensions.superboogav2.parameters as parameters +from modules.web_search import _validate_url from .data_processor import process_and_add_to_collector from .utils import create_metadata_source def _download_single(url): + _validate_url(url) response = requests.get(url, timeout=5) if response.status_code == 200: return response.content From 091037ec20743ac6c7bccb75b59743045a692c4a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 16:13:45 -0300 Subject: [PATCH 24/27] Fix top_logprobs_ids missing for llama.cpp loader --- modules/api/completions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 4eb8fdad..98bcff47 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -299,8 +299,9 @@ def format_completion_logprobs(entries): t = 
item.get('token', '') lp = item.get('logprob', item.get('prob', 0)) top_dict[t] = lp - if 'token_id' in item: - top_dict_ids[item['token_id']] = lp + tid = item.get('token_id', item.get('id')) + if tid is not None: + top_dict_ids[tid] = lp top_logprobs.append(top_dict) top_logprobs_ids.append(top_dict_ids if top_dict_ids else None) From a61bde509ff44a0f7662067bc94efd7f103f3162 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:30:02 -0700 Subject: [PATCH 25/27] Update llama.cpp --- requirements/full/requirements.txt | 8 ++++---- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 8 ++++---- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 2 +- requirements/portable/requirements_apple_silicon.txt | 2 +- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cuda131.txt | 4 ++-- requirements/portable/requirements_ik.txt | 4 ++-- requirements/portable/requirements_ik_cpu_only.txt | 4 ++-- requirements/portable/requirements_ik_cuda131.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- 15 files changed, 30 insertions(+), 30 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index e5bec6ec..f1a953a5 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -40,10 +40,10 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == 
"Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index c6b5b2d0..211600e2 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken 
# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 55a313e9..54d904dd 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -37,4 +37,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a6d34cbb..8829eb44 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -37,4 +37,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; 
platform_system == "Darwin" diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 4066b1af..0a8cfac6 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -37,7 +37,7 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 1180b42d..607c642f 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -23,5 +23,5 @@ 
sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 57aa6262..f0af64c8 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 894c9199..c5f351c5 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ 
b/requirements/portable/requirements_apple_intel.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 32b9727f..5287aa25 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 73b72832..038318ab 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index ad96bbe2..d87c741e 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt index 2fa037f7..3e2471ae 100644 --- a/requirements/portable/requirements_ik.txt +++ b/requirements/portable/requirements_ik.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == 
"Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt index b43b51c4..8272b9b6 100644 --- a/requirements/portable/requirements_ik_cpu_only.txt +++ b/requirements/portable/requirements_ik_cpu_only.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # ik_llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt index 12767285..98ef23d7 100644 --- a/requirements/portable/requirements_ik_cuda131.txt +++ b/requirements/portable/requirements_ik_cuda131.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index a5df3ad4..157ad313 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # Vulkan wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From d84157403a1c8b65f8597302463e46c28a6659d1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:31:44 -0700 Subject: [PATCH 26/27] Update the custom gradio wheels --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 4 ++-- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_nowheels.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- 
requirements/portable/requirements_amd.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 4 ++-- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cuda131.txt | 4 ++-- requirements/portable/requirements_ik.txt | 4 ++-- requirements/portable/requirements_ik_cpu_only.txt | 4 ++-- requirements/portable/requirements_ik_cuda131.txt | 4 ++-- requirements/portable/requirements_nowheels.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- 17 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index f1a953a5..b38ae848 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -31,8 +31,8 @@ tqdm wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 211600e2..7fb3a7d9 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 54d904dd..4a0f764c 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 8829eb44..942d5d71 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 0a8cfac6..6b61dca7 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -28,8 +28,8 
@@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 7173345a..a4d6cc97 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 607c642f..5aff54b2 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index f0af64c8..0771f53e 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index c5f351c5..427d59b2 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 5287aa25..c47a6ca1 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ 
b/requirements/portable/requirements_apple_silicon.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 038318ab..e491e357 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index d87c741e..5870983a 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt index 3e2471ae..d11d337d 100644 --- a/requirements/portable/requirements_ik.txt +++ b/requirements/portable/requirements_ik.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt index 8272b9b6..c2b69e1c 100644 --- a/requirements/portable/requirements_ik_cpu_only.txt +++ b/requirements/portable/requirements_ik_cpu_only.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt index 98ef23d7..7f280930 100644 --- 
a/requirements/portable/requirements_ik_cuda131.txt +++ b/requirements/portable/requirements_ik_cuda131.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index e38140ce..322056be 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 157ad313..dfd52be5 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 From 7aab2fdf9aefb0f14fbf58e132a2a9a5850f8319 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:50:42 -0700 Subject: [PATCH 27/27] API: Improve cache clearing in logprobs --- modules/api/completions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 98bcff47..f2282731 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -89,6 +89,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): return [{"token": first_token_str, "null_logprob": True}] import torch + from modules.torch_utils import clear_torch_cache if hasattr(model, 'get_prompt_logits'): logits = model.get_prompt_logits(input_ids) @@ -143,7 +144,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): del chunk_logits, chunk_lse, chunk_top_values del logits - torch.cuda.empty_cache() + clear_torch_cache() all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0) all_top_indices = torch.cat(all_top_indices_list, dim=0)