From 807be1183272fac409ce8f08609dbdd0d9f63362 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 18:48:50 -0700
Subject: [PATCH 01/27] Remove obsolete models/config.yaml and related code
---
docs/01 - Chat Tab.md | 2 +-
docs/12 - OpenAI API.md | 2 +-
modules/models.py | 1 -
modules/models_settings.py | 9 +-
modules/shared.py | 10 --
server.py | 5 -
user_data/models/config.yaml | 203 -----------------------------------
7 files changed, 4 insertions(+), 228 deletions(-)
delete mode 100644 user_data/models/config.yaml
diff --git a/docs/01 - Chat Tab.md b/docs/01 - Chat Tab.md
index 5104895f..96b232fa 100644
--- a/docs/01 - Chat Tab.md
+++ b/docs/01 - Chat Tab.md
@@ -112,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin
The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template.
-Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format.
+Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any) from the model metadata (e.g. `tokenizer_config.json` or GGUF metadata), and will update the values under "Parameters" > "Instruction template" accordingly. You should check the model card on Hugging Face to see if you are using the correct prompt format.
### Chat-instruct
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 2a7a7f69..0a076c35 100644
--- a/docs/12 - OpenAI API.md
+++ b/docs/12 - OpenAI API.md
@@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \
#### Chat completions
-Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`.
+Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be detected automatically from the model metadata.
```shell
curl http://127.0.0.1:5000/v1/chat/completions \
diff --git a/modules/models.py b/modules/models.py
index 1d139b89..b2665c6b 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -67,7 +67,6 @@ def load_model(model_name, loader=None):
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
logger.info(f"LOADER: \"{loader}\"")
logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
- logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
return model, tokenizer
diff --git a/modules/models_settings.py b/modules/models_settings.py
index dcface71..eafa0581 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -23,14 +23,9 @@ def get_fallback_settings():
def get_model_metadata(model):
model_path = resolve_model_path(model)
- model_settings = {}
- # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
- settings = shared.model_config
- for pat in settings:
- if re.match(pat.lower(), Path(model).name.lower()):
- for k in settings[pat]:
- model_settings[k] = settings[pat][k]
+ # Fallback settings
+ model_settings = get_fallback_settings()
path = model_path / 'config.json'
if path.exists():
diff --git a/modules/shared.py b/modules/shared.py
index 16ccbe77..acb103b4 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -454,17 +454,7 @@ def load_user_config():
args.loader = fix_loader_name(args.loader)
-# Load model-specific settings
-p = Path(f'{args.model_dir}/config.yaml')
-if p.exists():
- model_config = yaml.safe_load(open(p, 'r').read())
-else:
- model_config = {}
-del p
-
-
# Load custom model-specific settings
user_config = load_user_config()
-model_config = OrderedDict(model_config)
user_config = OrderedDict(user_config)
diff --git a/server.py b/server.py
index d224909c..88936ca6 100644
--- a/server.py
+++ b/server.py
@@ -18,7 +18,6 @@ import modules.extensions as extensions_module
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model_if_idle
from modules.models_settings import (
- get_fallback_settings,
get_model_metadata,
update_model_parameters
)
@@ -271,10 +270,6 @@ if __name__ == "__main__":
# Apply CLI overrides for image model settings (CLI flags take precedence over saved settings)
shared.apply_image_model_cli_overrides()
- # Fallback settings for models
- shared.model_config['.*'] = get_fallback_settings()
- shared.model_config.move_to_end('.*', last=False) # Move to the beginning
-
# Activate the extensions listed on settings.yaml
extensions_module.available_extensions = utils.get_available_extensions()
for extension in shared.settings['default_extensions']:
diff --git a/user_data/models/config.yaml b/user_data/models/config.yaml
deleted file mode 100644
index 038ebcf1..00000000
--- a/user_data/models/config.yaml
+++ /dev/null
@@ -1,203 +0,0 @@
-.*(llama|alpac|vicuna|guanaco|koala|llava|wizardlm|metharme|pygmalion-7b|pygmalion-2|mythalion|wizard-mega|openbuddy|vigogne|h2ogpt-research|manticore):
- model_type: 'llama'
-.*(opt-|opt_|opt1|opt3|optfor|galactica|galpaca|pygmalion-350m):
- model_type: 'opt'
-.*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1):
- model_type: 'gptj'
-.*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm):
- model_type: 'gptneox'
-.*bloom:
- model_type: 'bloom'
-.*gpt2:
- model_type: 'gpt2'
-.*falcon:
- model_type: 'falcon'
-.*mpt:
- model_type: 'mpt'
-.*(starcoder|starchat):
- model_type: 'starcoder'
-.*dolly-v2:
- model_type: 'dollyv2'
-.*replit:
- model_type: 'replit'
-.*(oasst|openassistant-|stablelm-7b-sft-v7-epoch-3):
- instruction_template: 'Open Assistant'
- skip_special_tokens: false
-(?!.*galactica)(?!.*reward).*openassistant:
- instruction_template: 'Open Assistant'
- skip_special_tokens: false
-.*galactica:
- skip_special_tokens: false
-.*dolly-v[0-9]-[0-9]*b:
- instruction_template: 'Alpaca'
- skip_special_tokens: false
-.*alpaca-native-4bit:
- instruction_template: 'Alpaca'
-.*llava:
- instruction_template: 'LLaVA'
-.*llava.*1.5:
- instruction_template: 'Vicuna-v1.1'
-.*wizard.*mega:
- instruction_template: 'Wizard-Mega'
-.*starchat-beta:
- instruction_template: 'Starchat-Beta'
-(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna:
- instruction_template: 'Vicuna-v0'
-.*vicuna.*v0:
- instruction_template: 'Vicuna-v0'
-.*vicuna.*(1.1|1_1|1.3|1_3):
- instruction_template: 'Vicuna-v1.1'
-.*vicuna.*(1.5|1_5):
- instruction_template: 'Vicuna-v1.1'
-.*stable.*vicuna:
- instruction_template: 'StableVicuna'
-(?!.*chat).*chinese-vicuna:
- instruction_template: 'Alpaca'
-.*chinese-vicuna.*chat:
- instruction_template: 'Chinese-Vicuna-Chat'
-.*alpaca:
- instruction_template: 'Alpaca'
-.*koala:
- instruction_template: 'Koala'
-.*chatglm:
- instruction_template: 'ChatGLM'
-.*(metharme|pygmalion|mythalion):
- instruction_template: 'Metharme'
-.*raven:
- instruction_template: 'RWKV-Raven'
-.*moss-moon.*sft:
- instruction_template: 'MOSS'
-.*stablelm-tuned:
- instruction_template: 'StableLM'
-.*galactica.*finetuned:
- instruction_template: 'Galactica Finetuned'
-.*galactica.*-v2:
- instruction_template: 'Galactica v2'
-(?!.*finetuned)(?!.*-v2).*galactica:
- instruction_template: 'Galactica'
-.*guanaco:
- instruction_template: 'Guanaco non-chat'
-.*baize:
- instruction_template: 'Baize'
-.*mpt-.*instruct:
- instruction_template: 'Alpaca'
-.*mpt-.*chat:
- instruction_template: 'ChatML'
-(?!.*-flan-)(?!.*-t5-).*lamini-:
- instruction_template: 'Alpaca'
-.*incite.*chat:
- instruction_template: 'INCITE-Chat'
-.*incite.*instruct:
- instruction_template: 'INCITE-Instruct'
-.*ziya-:
- instruction_template: 'Ziya'
-.*koalpaca:
- instruction_template: 'KoAlpaca'
-.*openbuddy:
- instruction_template: 'OpenBuddy'
-(?!.*chat).*vigogne:
- instruction_template: 'Vigogne-Instruct'
-.*vigogne.*chat:
- instruction_template: 'Vigogne-Chat'
-.*(llama-deus|supercot|llama-natural-instructions|open-llama-0.3t-7b-instruct-dolly-hhrlhf|open-llama-0.3t-7b-open-instruct):
- instruction_template: 'Alpaca'
-.*bactrian:
- instruction_template: 'Bactrian'
-.*(h2ogpt-oig-|h2ogpt-oasst1-|h2ogpt-research-oasst1-):
- instruction_template: 'INCITE-Chat'
-.*h2ogpt-gm-:
- instruction_template: 'H2O-prompt_answer'
-.*manticore:
- instruction_template: 'Manticore Chat'
-.*bluemoonrp-(30|13)b:
- instruction_template: 'Bluemoon'
-.*Nous-Hermes-13b:
- instruction_template: 'Alpaca'
-.*airoboros:
- instruction_template: 'Vicuna-v1.1'
-.*airoboros.*1.2:
- instruction_template: 'Airoboros-v1.2'
-.*alpa(cino|sta):
- instruction_template: 'Alpaca'
-.*hippogriff:
- instruction_template: 'Hippogriff'
-.*lazarus:
- instruction_template: 'Alpaca'
-.*guanaco-.*(7|13|33|65)b:
- instruction_template: 'Vicuna-v0'
-.*hypermantis:
- instruction_template: 'Alpaca'
-.*open-llama-.*-open-instruct:
- instruction_template: 'Alpaca'
-.*starcoder-gpteacher-code-instruct:
- instruction_template: 'Alpaca'
-.*tulu:
- instruction_template: 'Tulu'
-.*chronos:
- instruction_template: 'Alpaca'
-.*samantha:
- instruction_template: 'Samantha'
-.*wizardcoder:
- instruction_template: 'Alpaca'
-.*minotaur:
- instruction_template: 'Manticore Chat'
-.*orca_mini:
- instruction_template: 'Orca Mini'
-.*(platypus|gplatty|superplatty):
- instruction_template: 'Alpaca'
-.*(openorca-platypus2):
- instruction_template: 'OpenOrca-Platypus2'
-.*longchat:
- instruction_template: 'Vicuna-v1.1'
-.*vicuna-33b:
- instruction_template: 'Vicuna-v1.1'
-.*redmond-hermes-coder:
- instruction_template: 'Alpaca'
-.*wizardcoder-15b:
- instruction_template: 'Alpaca'
-.*wizardlm:
- instruction_template: 'Vicuna-v1.1'
-.*godzilla:
- instruction_template: 'Alpaca'
-.*llama(-?)(2|v2).*chat:
- instruction_template: 'Llama-v2'
-.*newhope:
- instruction_template: 'NewHope'
-.*stablebeluga2:
- instruction_template: 'StableBeluga2'
-.*openchat:
- instruction_template: 'OpenChat'
-.*codellama.*instruct:
- instruction_template: 'Llama-v2'
-.*(mistral|mixtral).*instruct:
- instruction_template: 'Mistral'
-.*mistral.*openorca:
- instruction_template: 'ChatML'
-.*(WizardCoder-Python-34B-V1.0|Phind-CodeLlama-34B-v2|CodeBooga-34B-v0.1):
- instruction_template: 'Alpaca'
-.*orca-2-(13|7)b:
- instruction_template: 'ChatML'
-.*openhermes.*mistral:
- instruction_template: 'ChatML'
-.*Yi-34B-Chat:
- instruction_template: 'ChatML'
-(dolphin).*:
- instruction_template: 'ChatML'
-.*synthia:
- instruction_template: 'Synthia'
-.*(hercules|hyperion):
- instruction_template: 'ChatML'
-.*command-r:
- instruction_template: 'Command-R'
-.*xwin-lm-70b-v0.1:
- instruction_template: 'Vicuna-v1.1'
-.*platypus-yi-34b:
- instruction_template: 'Vicuna-v1.1'
-.*CausalLM-RP-34B:
- instruction_template: 'ChatML'
-34b-beta:
- instruction_template: 'ChatML'
-.*airoboros-3_1-yi-34b-200k:
- instruction_template: 'Llama-v2'
-.*chatqa:
- instruction_template: 'NVIDIA-ChatQA'
From d6f1485dd189494f6fbe5b6ea7ebd5cc0404233a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 21:45:11 -0700
Subject: [PATCH 02/27] UI: Update the enable_thinking info message
---
modules/ui_chat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index f1dc7883..10d05f65 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -82,7 +82,7 @@ def create_ui():
gr.HTML("
")
shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
- shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')
+ shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.')
gr.HTML("")
From 368f37335f634ba001d00d2841902de85c7b48db Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 25 Mar 2026 06:37:45 -0700
Subject: [PATCH 03/27] Fix --idle-timeout issues with encode/decode and
parallel generation
---
modules/logits.py | 4 +---
modules/models.py | 15 ++++++++++++++-
modules/text_generation.py | 18 +++++++++++++-----
3 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/modules/logits.py b/modules/logits.py
index 1f878f27..473f5890 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -4,7 +4,6 @@ import numpy as np
from modules import models, shared
from modules.logging_colors import logger
-from modules.models import load_model
from modules.text_generation import generate_reply
from modules.utils import check_model_loaded
@@ -12,8 +11,7 @@ global_scores = None
def get_next_logits(*args, **kwargs):
- if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
- shared.model, shared.tokenizer = load_model(shared.model_name)
+ models.load_model_if_idle_unloaded()
needs_lock = not args[2] # use_samplers
if needs_lock:
diff --git a/modules/models.py b/modules/models.py
index b2665c6b..61ca3838 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -1,4 +1,5 @@
import sys
+import threading
import time
import modules.shared as shared
@@ -7,6 +8,15 @@ from modules.models_settings import get_model_metadata
from modules.utils import resolve_model_path
last_generation_time = time.time()
+active_generation_count = 0
+_generation_count_lock = threading.Lock()
+
+
+def load_model_if_idle_unloaded():
+ global last_generation_time
+ if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
+ shared.model, shared.tokenizer = load_model(shared.model_name)
+ last_generation_time = time.time()
def load_model(model_name, loader=None):
@@ -158,7 +168,10 @@ def unload_model_if_idle():
while True:
shared.generation_lock.acquire()
try:
- if time.time() - last_generation_time > shared.args.idle_timeout * 60:
+ with _generation_count_lock:
+ is_active = active_generation_count > 0
+
+ if not is_active and time.time() - last_generation_time > shared.args.idle_timeout * 60:
if shared.model is not None:
logger.info("Unloading the model for inactivity.")
unload_model(keep_model_name=True)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f77be124..3a9ddab5 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -17,9 +17,7 @@ from modules.utils import check_model_loaded
def generate_reply(*args, **kwargs):
- if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
- from modules.models import load_model
- shared.model, shared.tokenizer = load_model(shared.model_name)
+ models.load_model_if_idle_unloaded()
state = args[1] if len(args) > 1 else kwargs.get('state', {})
use_parallel = (
@@ -31,10 +29,16 @@ def generate_reply(*args, **kwargs):
if not use_parallel:
shared.generation_lock.acquire()
+ with models._generation_count_lock:
+ models.active_generation_count += 1
+
try:
for result in _generate_reply(*args, **kwargs):
yield result
finally:
+ with models._generation_count_lock:
+ models.active_generation_count -= 1
+
models.last_generation_time = time.time()
if not use_parallel:
shared.generation_lock.release()
@@ -126,7 +130,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
if shared.tokenizer is None:
- raise ValueError('No tokenizer is loaded')
+ models.load_model_if_idle_unloaded()
+ if shared.tokenizer is None:
+ raise ValueError('No tokenizer is loaded')
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':
@@ -176,7 +182,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
def decode(output_ids, skip_special_tokens=True):
if shared.tokenizer is None:
- raise ValueError('No tokenizer is loaded')
+ models.load_model_if_idle_unloaded()
+ if shared.tokenizer is None:
+ raise ValueError('No tokenizer is loaded')
return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens)
From e1541400219043f9b9cebf5f002b48251efc8bf9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 25 Mar 2026 07:21:02 -0700
Subject: [PATCH 04/27] Rename "truncation length" to "context length" in logs
---
modules/api/models.py | 2 +-
modules/models.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/modules/api/models.py b/modules/api/models.py
index c879a860..b89397d3 100644
--- a/modules/api/models.py
+++ b/modules/api/models.py
@@ -68,7 +68,7 @@ def _load_model(data):
if k in shared.settings:
shared.settings[k] = settings[k]
if k == 'truncation_length':
- logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}")
+ logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}")
elif k == 'instruction_template':
logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}")
diff --git a/modules/models.py b/modules/models.py
index 61ca3838..e997d2d8 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -76,7 +76,7 @@ def load_model(model_name, loader=None):
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
logger.info(f"LOADER: \"{loader}\"")
- logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
+ logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}")
return model, tokenizer
From 4cbea02ed4e0dee2efd066ac48bcdf33631b9eca Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 26 Mar 2026 06:49:39 -0700
Subject: [PATCH 05/27] Add ik_llama.cpp support via `--ik` flag
---
modules/llama_cpp_server.py | 37 +++++++++++++++++++++++++++++++++++++
modules/shared.py | 1 +
2 files changed, 38 insertions(+)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2ae01ddc..9b9756a9 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -470,6 +470,10 @@ class LlamaServer:
else:
cmd.append(f"--{flag_item}")
+ # Patch flags for ik_llama.cpp compatibility
+ if shared.args.ik:
+ cmd = _patch_cmd_for_ik(cmd)
+
env = os.environ.copy()
if os.name == 'posix':
current_path = env.get('LD_LIBRARY_PATH', '')
@@ -607,3 +611,36 @@ def filter_stderr_with_progress(process_stderr):
process_stderr.close()
except Exception:
pass
+
+
+def _patch_cmd_for_ik(cmd):
+ """
+ Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents:
+ --no-webui → --webui none
+ --fit off → (removed)
+ --fit on / --fit-ctx → --fit (bare flag)
+ --fit-target → --fit-margin
+ """
+ patched = []
+ i = 0
+ while i < len(cmd):
+ arg = cmd[i]
+
+ if arg == "--no-webui":
+ patched += ["--webui", "none"]
+ elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"):
+ val = cmd[i + 1]
+ i += 1
+ if val == "on":
+ patched.append("--fit")
+ # "off" → drop entirely
+ elif arg == "--fit-ctx":
+ i += 1 # skip the value
+ elif arg == "--fit-target":
+ patched.append("--fit-margin")
+ else:
+ patched.append(arg)
+
+ i += 1
+
+ return patched
diff --git a/modules/shared.py b/modules/shared.py
index acb103b4..c50736d7 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,6 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside /lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
From bda95172bd6abecba165fc118f140cfc446f3c42 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 06:09:53 -0700
Subject: [PATCH 06/27] Fix stopping string detection for chromadb/context-1
---
modules/chat.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/modules/chat.py b/modules/chat.py
index f8088e0f..edda11b0 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -671,7 +671,10 @@ def get_stopping_strings(state):
# Handle GPT-OSS as a special case
if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
result.remove("<|end|>")
- result.append("<|result|>")
+ if '<|result|>' in state['instruction_template_str']:
+ result.append("<|result|>")
+ elif '<|return|>' in state['instruction_template_str']:
+ result.append("<|return|>")
result = list(set(result))
if shared.args.verbose:
From 9dd04b86ce407507bcaf0862b97aadc64b6e62a6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 06:17:57 -0700
Subject: [PATCH 07/27] Suppress EOS token at logit level for ExLlamav3 when
ban_eos_token is set
---
modules/exllamav3.py | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 75c76c7c..f873503a 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -423,6 +423,15 @@ class Exllamav3Model:
if logit_bias:
filters.append(LogitBiasFilter(self.tokenizer, logit_bias))
+ # Suppress EOS tokens via logit bias so they are never sampled
+ if state['ban_eos_token']:
+ eos_bias = {}
+ for eos_id in self.config.eos_token_id_list:
+ if eos_id is not None:
+ eos_bias[str(eos_id)] = float('-inf')
+ if eos_bias:
+ filters.append(LogitBiasFilter(self.tokenizer, eos_bias))
+
# Logprobs support (OpenAI API)
logprobs = state.get('logprobs', 0) or 0
return_top_tokens = logprobs if logprobs > 0 else 0
From 4979e87e48c78d5e3186e4d9b2fbc8b30e86164f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 11:49:47 -0300
Subject: [PATCH 08/27] Add ik_llama.cpp support via ik_llama_cpp_binaries
package
---
.github/workflows/build-everything-tgw.yml | 35 +++
.../build-portable-release-ik-cuda.yml | 179 +++++++++++++++
.../workflows/build-portable-release-ik.yml | 205 ++++++++++++++++++
modules/llama_cpp_server.py | 21 +-
modules/loaders.py | 2 +
modules/shared.py | 2 +-
modules/ui_model_menu.py | 3 +
requirements/full/requirements.txt | 6 +-
requirements/full/requirements_amd.txt | 4 +-
.../full/requirements_apple_intel.txt | 3 +-
.../full/requirements_apple_silicon.txt | 3 +-
requirements/full/requirements_cpu_only.txt | 6 +-
requirements/portable/requirements.txt | 4 +-
requirements/portable/requirements_amd.txt | 4 +-
.../portable/requirements_apple_intel.txt | 2 +-
.../portable/requirements_apple_silicon.txt | 2 +-
.../portable/requirements_cpu_only.txt | 4 +-
.../portable/requirements_cuda131.txt | 4 +-
requirements/portable/requirements_vulkan.txt | 4 +-
19 files changed, 469 insertions(+), 24 deletions(-)
create mode 100644 .github/workflows/build-portable-release-ik-cuda.yml
create mode 100644 .github/workflows/build-portable-release-ik.yml
diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 9322f859..4de591f4 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -68,3 +68,38 @@ jobs:
with:
version: ${{ inputs.version }}
config: 'os:macos-15-intel,macos-14'
+
+ build_release_ik_cuda_windows:
+ name: ik CUDA Windows
+ uses: ./.github/workflows/build-portable-release-ik-cuda.yml
+ with:
+ version: ${{ inputs.version }}
+ config: 'os:windows-2022'
+
+ build_release_ik_cuda_linux:
+ name: ik CUDA Linux
+ uses: ./.github/workflows/build-portable-release-ik-cuda.yml
+ with:
+ version: ${{ inputs.version }}
+ config: 'os:ubuntu-22.04'
+
+ build_release_ik_cpu_windows:
+ name: ik CPU Windows
+ uses: ./.github/workflows/build-portable-release-ik.yml
+ with:
+ version: ${{ inputs.version }}
+ config: 'os:windows-2022'
+
+ build_release_ik_cpu_linux:
+ name: ik CPU Linux
+ uses: ./.github/workflows/build-portable-release-ik.yml
+ with:
+ version: ${{ inputs.version }}
+ config: 'os:ubuntu-22.04'
+
+ build_release_ik_macos:
+ name: ik macOS
+ uses: ./.github/workflows/build-portable-release-ik.yml
+ with:
+ version: ${{ inputs.version }}
+ config: 'os:macos-14'
diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml
new file mode 100644
index 00000000..40b4b92f
--- /dev/null
+++ b/.github/workflows/build-portable-release-ik-cuda.yml
@@ -0,0 +1,179 @@
+name: Build ik CUDA
+
+on:
+ workflow_dispatch:
+ inputs:
+ version:
+ description: 'Version tag of text-generation-webui to build: v3.0'
+ default: 'v3.0'
+ required: true
+ type: string
+ config:
+ description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+ default: 'Default'
+ required: false
+ type: string
+ exclude:
+ description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+ default: 'None'
+ required: false
+ type: string
+ workflow_call:
+ inputs:
+ version:
+ description: 'Version tag of text-generation-webui to build: v3.0'
+ default: 'v3.0'
+ required: true
+ type: string
+ config:
+ description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+ default: 'Default'
+ required: false
+ type: string
+ exclude:
+ description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+ default: 'None'
+ required: false
+ type: string
+
+permissions:
+ contents: write
+
+jobs:
+ define_matrix:
+ name: Define Build Matrix
+ runs-on: ubuntu-latest
+ outputs:
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
+ defaults:
+ run:
+ shell: pwsh
+ env:
+ CONFIGIN: ${{ inputs.config }}
+ EXCLUDEIN: ${{ inputs.exclude }}
+
+ steps:
+ - name: Define Job Output
+ id: set-matrix
+ run: |
+ $matrix = @{
+ 'os' = @('ubuntu-22.04', 'windows-2022')
+ 'pyver' = @("3.13")
+ 'cuda' = @("12.4", "13.1")
+ }
+
+ if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
+
+ if ($env:EXCLUDEIN -ne 'None') {
+ $exclusions = @()
+ $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
+ $matrix['exclude'] = $exclusions
+ }
+
+ $matrixOut = ConvertTo-Json $matrix -Compress
+ Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+ build_wheels:
+ name: ${{ matrix.os }} ${{ matrix.pyver }} CUDA ${{ matrix.cuda }}
+ needs: define_matrix
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+ defaults:
+ run:
+ shell: pwsh
+ env:
+ PCKGVER: ${{ inputs.version }}
+
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ repository: 'oobabooga/text-generation-webui'
+ ref: ${{ inputs.version }}
+ submodules: 'recursive'
+
+ - uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.pyver }}
+
+ - name: Build Package
+ shell: bash
+ run: |
+ VERSION_CLEAN="${{ inputs.version }}"
+ VERSION_CLEAN="${VERSION_CLEAN#v}"
+ cd ..
+ cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
+ cd "text-generation-webui-${VERSION_CLEAN}"
+
+ # Remove extensions that need additional requirements
+ allowed=("character_bias" "gallery" "sd_api_pictures")
+ find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
+
+ # Define common variables
+ CUDA_VERSION="${{ matrix.cuda }}"
+ VERSION="${{ inputs.version }}"
+
+ # 1. Set platform-specific variables
+ if [[ "$RUNNER_OS" == "Windows" ]]; then
+ PLATFORM="windows"
+ PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
+ PIP_PATH="portable_env/python.exe -m pip"
+ PACKAGES_PATH="portable_env/Lib/site-packages"
+ rm start_linux.sh start_macos.sh
+ else
+ PLATFORM="linux"
+ PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
+ PIP_PATH="portable_env/bin/python -m pip"
+ PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+ rm start_macos.sh start_windows.bat
+ fi
+
+ # 2. Download and extract Python
+ cd ..
+ echo "Downloading Python for $PLATFORM..."
+ curl -L -o python-build.tar.gz "$PYTHON_URL"
+ tar -xzf python-build.tar.gz
+ mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+
+ # 3. Prepare requirements file based on CUDA version
+ cd "text-generation-webui-${VERSION_CLEAN}"
+ if [[ "$CUDA_VERSION" == "13.1" ]]; then
+ REQ_FILE="requirements/portable/requirements_cuda131.txt"
+ else
+ REQ_FILE="requirements/portable/requirements.txt"
+ fi
+
+ # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
+ sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+ sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true
+
+ # 5. Install packages
+ echo "Installing Python packages from $REQ_FILE..."
+ $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
+
+ # 6. Clean up
+ rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
+
+ # 7. Create archive
+ cd ..
+ if [[ "$RUNNER_OS" == "Windows" ]]; then
+ ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
+ echo "Creating archive: $ARCHIVE_NAME"
+ powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+ else
+ ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz"
+ echo "Creating archive: $ARCHIVE_NAME"
+ tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+ fi
+
+ - name: Upload files to a GitHub release
+ id: upload-release
+ uses: svenstaro/upload-release-action@2.7.0
+ continue-on-error: true
+ with:
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
+ file: ../textgen-portable-ik-*
+ tag: ${{ inputs.version }}
+ file_glob: true
+ make_latest: false
+ overwrite: true
diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml
new file mode 100644
index 00000000..afb2e763
--- /dev/null
+++ b/.github/workflows/build-portable-release-ik.yml
@@ -0,0 +1,205 @@
+name: Build ik CPU and macOS
+
+on:
+ workflow_dispatch:
+ inputs:
+ version:
+ description: 'Version tag of text-generation-webui to build: v3.0'
+ default: 'v3.0'
+ required: true
+ type: string
+ config:
+ description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+ default: 'Default'
+ required: false
+ type: string
+ exclude:
+ description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+ default: 'None'
+ required: false
+ type: string
+ workflow_call:
+ inputs:
+ version:
+ description: 'Version tag of text-generation-webui to build: v3.0'
+ default: 'v3.0'
+ required: true
+ type: string
+ config:
+ description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+ default: 'Default'
+ required: false
+ type: string
+ exclude:
+ description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+ default: 'None'
+ required: false
+ type: string
+
+permissions:
+ contents: write
+
+jobs:
+ define_matrix:
+ name: Define Build Matrix
+ runs-on: ubuntu-latest
+ outputs:
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
+ defaults:
+ run:
+ shell: pwsh
+ env:
+ CONFIGIN: ${{ inputs.config }}
+ EXCLUDEIN: ${{ inputs.exclude }}
+
+ steps:
+ - name: Define Job Output
+ id: set-matrix
+ run: |
+ $matrix = @{
+ 'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
+ 'pyver' = @("3.13")
+ }
+
+ if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
+
+ if ($env:EXCLUDEIN -ne 'None') {
+ $exclusions = @()
+ $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
+ $matrix['exclude'] = $exclusions
+ }
+
+ $matrixOut = ConvertTo-Json $matrix -Compress
+ Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+ build_wheels:
+ name: ${{ matrix.os }} ${{ matrix.pyver }}
+ needs: define_matrix
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+ defaults:
+ run:
+ shell: pwsh
+ env:
+ PCKGVER: ${{ inputs.version }}
+
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ repository: 'oobabooga/text-generation-webui'
+ ref: ${{ inputs.version }}
+ submodules: 'recursive'
+
+ - uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.pyver }}
+
+ - name: Build Package
+ shell: bash
+ run: |
+ VERSION_CLEAN="${{ inputs.version }}"
+ VERSION_CLEAN="${VERSION_CLEAN#v}"
+ cd ..
+ cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
+ cd "text-generation-webui-${VERSION_CLEAN}"
+
+ # Remove extensions that need additional requirements
+ allowed=("character_bias" "gallery" "sd_api_pictures")
+ find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
+
+ # Define common variables
+ VERSION="${{ inputs.version }}"
+ OS_TYPE="${{ matrix.os }}"
+
+ # 1. Set platform-specific variables
+ if [[ "$RUNNER_OS" == "Windows" ]]; then
+ PLATFORM="windows-cpu"
+ PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
+ PIP_PATH="portable_env/python.exe -m pip"
+ PACKAGES_PATH="portable_env/Lib/site-packages"
+ rm start_linux.sh start_macos.sh
+ elif [[ "$RUNNER_OS" == "macOS" ]]; then
+ if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
+ PLATFORM="macos-x86_64"
+ PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz"
+ REQ_TYPE="apple_intel"
+ else
+ PLATFORM="macos-arm64"
+ PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz"
+ REQ_TYPE="apple_silicon"
+ fi
+ PIP_PATH="portable_env/bin/python -m pip"
+ PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+ rm start_linux.sh start_windows.bat
+ else
+ # Linux case
+ PLATFORM="linux-cpu"
+ PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
+ PIP_PATH="portable_env/bin/python -m pip"
+ PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+ rm start_macos.sh start_windows.bat
+ fi
+
+ # 2. Download and extract Python
+ echo "Downloading Python for $PLATFORM..."
+ cd ..
+ curl -L -o python-build.tar.gz "$PYTHON_URL"
+ tar -xzf python-build.tar.gz
+ mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+
+ # 3. Prepare requirements file based on platform
+ cd "text-generation-webui-${VERSION_CLEAN}"
+
+ # Select requirements file based on platform
+ if [[ "$RUNNER_OS" == "macOS" ]]; then
+ if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
+ REQ_FILE="requirements/portable/requirements_apple_intel.txt"
+ else
+ REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
+ fi
+ else
+ REQ_FILE="requirements/portable/requirements_cpu_only.txt"
+ fi
+
+ echo "Using requirements file: $REQ_FILE"
+
+ # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
+ if [[ "$RUNNER_OS" == "macOS" ]]; then
+ sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+ sed -i '' 's/--portable/--portable --ik/g' start_macos.sh
+ else
+ sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+ sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
+ fi
+
+ # 5. Install packages
+ echo "Installing Python packages from $REQ_FILE..."
+ $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
+
+ # 6. Clean up
+ rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
+
+ # 7. Create archive
+ cd ..
+ if [[ "$RUNNER_OS" == "Windows" ]]; then
+ ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip"
+ echo "Creating archive: $ARCHIVE_NAME"
+ powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+ else
+ ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz"
+ echo "Creating archive: $ARCHIVE_NAME"
+ tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+ fi
+
+ - name: Upload files to a GitHub release
+ id: upload-release
+ uses: svenstaro/upload-release-action@2.7.0
+ continue-on-error: true
+ with:
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
+ file: ../textgen-portable-ik-*
+ tag: ${{ inputs.version }}
+ file_glob: true
+ make_latest: false
+ overwrite: true
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 9b9756a9..5e2decfa 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -11,7 +11,6 @@ import time
from pathlib import Path
from typing import Any, List
-import llama_cpp_binaries
import requests
from modules import shared
@@ -357,7 +356,16 @@ class LlamaServer:
"""Start the llama.cpp server and wait until it's ready."""
# Determine the server path
if self.server_path is None:
- self.server_path = llama_cpp_binaries.get_binary_path()
+ if shared.args.ik:
+ try:
+ import ik_llama_cpp_binaries
+ except ImportError:
+                raise ImportError("--ik requires the ik_llama_cpp_binaries package. Install it with: pip install ik_llama_cpp_binaries")
+
+ self.server_path = ik_llama_cpp_binaries.get_binary_path()
+ else:
+ import llama_cpp_binaries
+ self.server_path = llama_cpp_binaries.get_binary_path()
# Build the command
cmd = [
@@ -616,10 +624,12 @@ def filter_stderr_with_progress(process_stderr):
def _patch_cmd_for_ik(cmd):
"""
Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents:
- --no-webui → --webui none
+ --no-webui → --webui none
--fit off → (removed)
--fit on / --fit-ctx → --fit (bare flag)
--fit-target → --fit-margin
+ --cache-reuse → (removed, unsupported)
+ --swa-full → (removed, unsupported)
"""
patched = []
i = 0
@@ -635,9 +645,14 @@ def _patch_cmd_for_ik(cmd):
patched.append("--fit")
# "off" → drop entirely
elif arg == "--fit-ctx":
+ patched.append("--fit")
i += 1 # skip the value
elif arg == "--fit-target":
patched.append("--fit-margin")
+ elif arg == "--cache-reuse":
+ i += 1 # skip the value
+ elif arg == "--swa-full":
+ pass # bare flag, just drop it
else:
patched.append(arg)
diff --git a/modules/loaders.py b/modules/loaders.py
index c90f2ebb..cb1f3d3b 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({
'no_mmap',
'mlock',
'numa',
+ 'ik',
'parallel',
'model_draft',
'draft_max',
@@ -345,6 +346,7 @@ def list_model_elements():
'spec_ngram_size_m',
'spec_ngram_min_hits',
'mmproj',
+ 'ik',
]
diff --git a/modules/shared.py b/modules/shared.py
index c50736d7..13843f0c 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,7 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
-group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside /lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. Requires the ik_llama_cpp_binaries package to be installed.')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 5b7621a7..16505afa 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -51,6 +51,9 @@ def create_ui():
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
+ if not shared.args.portable:
+ shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.')
+
shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 56619627..100c99d1 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 620683cc..66fa4ac7 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b1f109b2..98dc8be6 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index a54476a9..e33264cf 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index be82c904..cd083f6d 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,7 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 188da380..67182225 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 4562b6d0..5f5b2f8d 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 04dcf25e..f5f7d6ee 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 4b8af78a..e51fc296 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 5b0eaf89..683f94c8 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 90b3234f..942d0877 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index ea72b4ec..ae784e00 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From be6fc0663ac1b7a60b7fde24afb38de2b0aba57b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 08:11:28 -0700
Subject: [PATCH 09/27] Update the custom gradio wheels
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 4 ++--
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_nowheels.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_amd.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 4 ++--
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cuda131.txt | 4 ++--
requirements/portable/requirements_nowheels.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
14 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 100c99d1..6e11dd2f 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 66fa4ac7..c964eff6 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 98dc8be6..b1dd6a4f 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index e33264cf..4d03d280 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index cd083f6d..9d41d069 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 77c254e6..052085cc 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 67182225..ff80b6c8 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5f5b2f8d..318044da 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index f5f7d6ee..1676bffb 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index e51fc296..27fc2da8 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 683f94c8..0bbdd30a 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 942d0877..c3ae3c57 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index e8457909..e38140ce 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index ae784e00..e646c04c 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
# API
flask_cloudflared==0.0.15
From 0466b6e2714a05c04eff0c929f15e4679f029e8d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 29 Mar 2026 15:52:36 -0700
Subject: [PATCH 10/27] ik_llama.cpp: Auto-enable Hadamard KV cache rotation
with quantized cache
---
modules/llama_cpp_server.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 5e2decfa..fa968be1 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -631,6 +631,12 @@ def _patch_cmd_for_ik(cmd):
--cache-reuse → (removed, unsupported)
--swa-full → (removed, unsupported)
"""
+ # Add Hadamard KV cache rotation when using quantized cache types.
+ # This significantly improves quantized cache quality (especially q4_0)
+ # and is a no-op for MLA models like DeepSeek.
+ if shared.args.cache_type in ("q8_0", "q4_0"):
+ cmd += ["-khad", "-vhad"]
+
patched = []
i = 0
while i < len(cmd):
From 6382fbef8381bf60ff909b4fd76e7c1f4c063afc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 30 Mar 2026 17:44:19 -0700
Subject: [PATCH 11/27] Several small code simplifications
---
download-model.py | 25 +++---
js/dark_theme.js | 12 ++-
js/global_scope_js.js | 79 +++++++++---------
js/main.js | 171 +++++++++++++--------------------------
js/save_files.js | 18 ++---
js/show_controls.js | 21 ++---
js/switch_tabs.js | 24 ++----
js/update_big_picture.js | 3 +-
modules/extensions.py | 22 +++--
9 files changed, 140 insertions(+), 235 deletions(-)
diff --git a/download-model.py b/download-model.py
index 95d25e16..a31bbfc6 100644
--- a/download-model.py
+++ b/download-model.py
@@ -158,28 +158,21 @@ class ModelDownloader:
# Also if GGUF and safetensors are available, download only safetensors
if (has_pytorch or has_pt or has_gguf) and has_safetensors:
has_gguf = False
- for i in range(len(classifications) - 1, -1, -1):
- if classifications[i] in ['pytorch', 'pt', 'gguf']:
- links.pop(i)
- file_sizes.pop(i)
+ keep = [i for i, c in enumerate(classifications) if c not in ['pytorch', 'pt', 'gguf']]
+ links = [links[i] for i in keep]
+ file_sizes = [file_sizes[i] for i in keep]
# For GGUF, try to download only the Q4_K_M if no specific file is specified.
if has_gguf and specific_file is None:
- has_q4km = False
- for i in range(len(classifications) - 1, -1, -1):
- if 'q4_k_m' in links[i].lower():
- has_q4km = True
+ has_q4km = any('q4_k_m' in link.lower() for link in links)
if has_q4km:
- for i in range(len(classifications) - 1, -1, -1):
- if 'q4_k_m' not in links[i].lower():
- links.pop(i)
- file_sizes.pop(i)
+ keep = [i for i, link in enumerate(links) if 'q4_k_m' in link.lower()]
else:
- for i in range(len(classifications) - 1, -1, -1):
- if links[i].lower().endswith('.gguf'):
- links.pop(i)
- file_sizes.pop(i)
+ keep = [i for i, link in enumerate(links) if not link.lower().endswith('.gguf')]
+
+ links = [links[i] for i in keep]
+ file_sizes = [file_sizes[i] for i in keep]
is_llamacpp = has_gguf and specific_file is not None
return links, sha256, is_lora, is_llamacpp, file_sizes
diff --git a/js/dark_theme.js b/js/dark_theme.js
index 7136f5bf..9d7069e2 100644
--- a/js/dark_theme.js
+++ b/js/dark_theme.js
@@ -1,6 +1,6 @@
function toggleDarkMode() {
document.body.classList.toggle("dark");
- var currentCSS = document.getElementById("highlight-css");
+ const currentCSS = document.getElementById("highlight-css");
if (currentCSS.getAttribute("href") === "file/css/highlightjs/github-dark.min.css") {
currentCSS.setAttribute("href", "file/css/highlightjs/github.min.css");
} else {
@@ -9,12 +9,10 @@ function toggleDarkMode() {
// Re-highlight all code blocks once stylesheet loads
currentCSS.onload = function() {
- const messageBodies = document.getElementById("chat").querySelectorAll(".message-body");
- messageBodies.forEach((messageBody) => {
- const codeBlocks = messageBody.querySelectorAll("pre code");
- codeBlocks.forEach((codeBlock) => {
- hljs.highlightElement(codeBlock);
- });
+ // Clear data-highlighted so hljs will re-process with the new theme
+ document.querySelectorAll("#chat .message-body pre code[data-highlighted]").forEach((codeBlock) => {
+ delete codeBlock.dataset.highlighted;
});
+ doSyntaxHighlighting();
};
}
diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 92f65622..20eeef66 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -1,11 +1,35 @@
+// -------------------------------------------------
+// Shared helpers
+// -------------------------------------------------
+
+function getProfilePictureUrl() {
+ return "/file/user_data/cache/pfp_character.png?time=" + Date.now();
+}
+
+const MESSAGE_SELECTOR = ".message, .user-message, .assistant-message";
+
+function getMessageElement(element) {
+ if (!element) return null;
+ return element.closest(MESSAGE_SELECTOR);
+}
+
+function isUserRole(messageElement) {
+ return messageElement.classList.contains("user-message") ||
+ messageElement.querySelector(".text-you") !== null ||
+ messageElement.querySelector(".circle-you") !== null;
+}
+
+// Trigger a synthetic 'input' event so Gradio picks up programmatic value changes
+function dispatchGradioInput(element) {
+ element.dispatchEvent(new Event("input", { bubbles: true }));
+}
+
// -------------------------------------------------
// Event handlers
// -------------------------------------------------
function copyToClipboard(element) {
- if (!element) return;
-
- const messageElement = element.closest(".message, .user-message, .assistant-message");
+ const messageElement = getMessageElement(element);
if (!messageElement) return;
const rawText = messageElement.getAttribute("data-raw");
@@ -48,9 +72,7 @@ function fallbackCopyToClipboard(text) {
}
function branchHere(element) {
- if (!element) return;
-
- const messageElement = element.closest(".message, .user-message, .assistant-message");
+ const messageElement = getMessageElement(element);
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
@@ -69,11 +91,7 @@ function branchHere(element) {
}
branchIndexInput.value = index;
-
- // Trigger any 'change' or 'input' events Gradio might be listening for
- const event = new Event("input", { bubbles: true });
- branchIndexInput.dispatchEvent(event);
-
+ dispatchGradioInput(branchIndexInput);
branchButton.click();
}
@@ -82,9 +100,7 @@ function branchHere(element) {
// -------------------------------------------------
function editHere(buttonElement) {
- if (!buttonElement) return;
-
- const messageElement = buttonElement.closest(".message, .user-message, .assistant-message");
+ const messageElement = getMessageElement(buttonElement);
if (!messageElement) return;
const messageBody = messageElement.querySelector(".message-body");
@@ -97,12 +113,7 @@ function editHere(buttonElement) {
return;
}
- // Determine role based on message element - handle different chat modes
- const isUserMessage = messageElement.classList.contains("user-message") ||
- messageElement.querySelector(".text-you") !== null ||
- messageElement.querySelector(".circle-you") !== null;
-
- startEditing(messageElement, messageBody, isUserMessage);
+ startEditing(messageElement, messageBody, isUserRole(messageElement));
}
function startEditing(messageElement, messageBody, isUserMessage) {
@@ -209,30 +220,22 @@ function submitMessageEdit(index, newText, isUserMessage) {
editTextInput.value = newText;
editRoleInput.value = isUserMessage ? "user" : "assistant";
- editIndexInput.dispatchEvent(new Event("input", { bubbles: true }));
- editTextInput.dispatchEvent(new Event("input", { bubbles: true }));
- editRoleInput.dispatchEvent(new Event("input", { bubbles: true }));
+ dispatchGradioInput(editIndexInput);
+ dispatchGradioInput(editTextInput);
+ dispatchGradioInput(editRoleInput);
editButton.click();
return true;
}
function navigateVersion(element, direction) {
- if (!element) return;
-
- const messageElement = element.closest(".message, .user-message, .assistant-message");
+ const messageElement = getMessageElement(element);
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
if (!index) return;
- // Determine role based on message element classes
- let role = "assistant"; // Default role
- if (messageElement.classList.contains("user-message") ||
- messageElement.querySelector(".text-you") ||
- messageElement.querySelector(".circle-you")) {
- role = "user";
- }
+ const role = isUserRole(messageElement) ? "user" : "assistant";
const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input");
const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea");
@@ -248,11 +251,9 @@ function navigateVersion(element, direction) {
directionInput.value = direction;
roleInput.value = role;
- // Trigger 'input' events for Gradio to pick up changes
- const event = new Event("input", { bubbles: true });
- indexInput.dispatchEvent(event);
- directionInput.dispatchEvent(event);
- roleInput.dispatchEvent(event);
+ dispatchGradioInput(indexInput);
+ dispatchGradioInput(directionInput);
+ dispatchGradioInput(roleInput);
navigateButton.click();
}
@@ -313,7 +314,7 @@ function handleMorphdomUpdate(data) {
function applyMorphdomUpdate(data) {
// Determine target element and use it as query scope
- var target_element, target_html;
+ let target_element, target_html;
if (data.last_message_only) {
const childNodes = document.getElementsByClassName("messages")[0].childNodes;
target_element = childNodes[childNodes.length - 1];
diff --git a/js/main.js b/js/main.js
index f05f93c6..cba4c903 100644
--- a/js/main.js
+++ b/js/main.js
@@ -4,8 +4,9 @@
// Sync highlight.js theme with the actual Gradio theme
var defined_hljs_css = document.body.classList.contains("dark") ? "file/css/highlightjs/github-dark.min.css" : "file/css/highlightjs/github.min.css";
-if (document.getElementById("highlight-css").getAttribute("href") !== defined_hljs_css) {
- document.getElementById("highlight-css").setAttribute("href", defined_hljs_css);
+var hljsCssElement = document.getElementById("highlight-css");
+if (hljsCssElement.getAttribute("href") !== defined_hljs_css) {
+ hljsCssElement.setAttribute("href", defined_hljs_css);
}
let main_parent = document.getElementById("chat-tab").parentNode;
@@ -49,21 +50,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event)
//------------------------------------------------
// --- Helper functions --- //
-function isModifiedKeyboardEvent() {
- return (event instanceof KeyboardEvent &&
- event.shiftKey ||
- event.ctrlKey ||
- event.altKey ||
- event.metaKey);
+function isModifiedKeyboardEvent(event) {
+ return event instanceof KeyboardEvent &&
+ (event.shiftKey || event.ctrlKey || event.altKey || event.metaKey);
}
-function isFocusedOnEditableTextbox() {
+function isFocusedOnEditableTextbox(event) {
if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") {
return !!event.target.value;
}
+ return false;
}
-let previousTabId = "chat-tab-button";
document.addEventListener("keydown", function(event) {
// Stop generation on Esc pressed
if (event.key === "Escape") {
@@ -117,14 +115,14 @@ document.addEventListener("keydown", function(event) {
}
// --- Simple version navigation --- //
- if (!isFocusedOnEditableTextbox()) {
+ if (!isFocusedOnEditableTextbox(event)) {
// Version navigation on Arrow keys (horizontal)
- if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") {
+ if (!isModifiedKeyboardEvent(event) && event.key === "ArrowLeft") {
event.preventDefault();
navigateLastAssistantMessage("left");
}
- else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") {
+ else if (!isModifiedKeyboardEvent(event) && event.key === "ArrowRight") {
event.preventDefault();
if (!navigateLastAssistantMessage("right")) {
// If can't navigate right (last version), regenerate
@@ -159,9 +157,8 @@ targetElement.addEventListener("scroll", function() {
let diff = targetElement.scrollHeight - targetElement.clientHeight;
let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0;
- // Add scrolling class to disable hover effects
if (window.isScrolled || !isAtBottomNow) {
- targetElement.classList.add("scrolling");
+ targetElement.classList.add("scrolling"); // Disables hover effects during scroll
}
if(isAtBottomNow) {
@@ -202,12 +199,8 @@ const observer = new MutationObserver(function() {
});
// Only watch for attribute changes on targetElement (e.g. _generating class)
-const config = {
- attributes: true
-};
-
// Start observing the target element
-observer.observe(targetElement, config);
+observer.observe(targetElement, { attributes: true });
//------------------------------------------------
// Handle syntax highlighting / LaTeX
@@ -228,7 +221,7 @@ window.doSyntaxHighlighting = function() {
if (messageBodies.length > 0) {
let hasSeenVisible = false;
- // Go from last message to first
+ // Go from last message to first so we can early-exit once past visible area
for (let i = messageBodies.length - 1; i >= 0; i--) {
const messageBody = messageBodies[i];
@@ -243,8 +236,8 @@ window.doSyntaxHighlighting = function() {
codeBlock.classList.add("pretty_scrollbar");
});
- // Only render math in visible elements
const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
+ // Only render math in individually visible containers (the outer check is on the message body)
mathContainers.forEach(container => {
if (isElementVisibleOnScreen(container)) {
renderMathInElement(container, {
@@ -271,7 +264,7 @@ const doSyntaxHighlighting = window.doSyntaxHighlighting;
// Add some scrollbars
//------------------------------------------------
const scrollbarElements = document.querySelectorAll(".add_scrollbar textarea, .add_scrollbar .drag-drop-list");
-for(i = 0; i < scrollbarElements.length; i++) {
+for(let i = 0; i < scrollbarElements.length; i++) {
scrollbarElements[i].classList.remove("scroll-hide");
scrollbarElements[i].classList.add("pretty_scrollbar");
scrollbarElements[i].style.resize = "none";
@@ -298,13 +291,13 @@ if (toolsInfo) {
// Remove some backgrounds
//------------------------------------------------
const noBackgroundelements = document.querySelectorAll(".no-background");
-for(i = 0; i < noBackgroundelements.length; i++) {
+for(let i = 0; i < noBackgroundelements.length; i++) {
noBackgroundelements[i].parentNode.style.border = "none";
noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center";
}
const slimDropdownElements = document.querySelectorAll(".slim-dropdown");
-for (i = 0; i < slimDropdownElements.length; i++) {
+for (let i = 0; i < slimDropdownElements.length; i++) {
const parentNode = slimDropdownElements[i].parentNode;
parentNode.style.background = "transparent";
parentNode.style.border = "0";
@@ -374,49 +367,43 @@ button.addEventListener("click", function () {
}
});
-// Add event listener for mouseleave on the button
-button.addEventListener("mouseleave", function () {
- // Delay to prevent menu hiding when the mouse leaves the button into the menu
+// Delay to prevent menu hiding when the mouse leaves the button or menu
+function delayedHideMenu() {
setTimeout(function () {
if (!isMouseOverButtonOrMenu()) {
hideMenu();
}
}, 100);
-});
+}
+// Add event listener for mouseleave on the button
+button.addEventListener("mouseleave", delayedHideMenu);
// Add event listener for mouseleave on the menu
-menu.addEventListener("mouseleave", function () {
- // Delay to prevent menu hide when the mouse leaves the menu into the button
- setTimeout(function () {
- if (!isMouseOverButtonOrMenu()) {
- hideMenu();
- }
- }, 100);
-});
+menu.addEventListener("mouseleave", delayedHideMenu);
// Add event listener for click anywhere in the document
document.addEventListener("click", function (event) {
- const target = event.target;
-
// Check if the click is outside the button/menu and the menu is visible
if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") {
hideMenu();
}
- if (event.target.classList.contains("pfp_character")) {
+ const target = event.target;
+
+ if (target.classList.contains("pfp_character")) {
toggleBigPicture();
}
// Handle sidebar clicks on mobile
if (isMobile()) {
- // Check if the click did NOT originate from any of the specified toggle buttons or elements
+ // Check if the click did NOT originate from any of the specified toggle buttons or elements
if (
target.closest("#navigation-toggle") !== navigationToggle &&
- target.closest("#past-chats-toggle") !== pastChatsToggle &&
- target.closest("#chat-controls-toggle") !== chatControlsToggle &&
- target.closest(".header_bar") !== headerBar &&
- target.closest("#past-chats-row") !== pastChatsRow &&
- target.closest("#chat-controls") !== chatControlsRow
+ target.closest("#past-chats-toggle") !== pastChatsToggle &&
+ target.closest("#chat-controls-toggle") !== chatControlsToggle &&
+ target.closest(".header_bar") !== headerBar &&
+ target.closest("#past-chats-row") !== pastChatsRow &&
+ target.closest("#chat-controls") !== chatControlsRow
) {
handleIndividualSidebarClose(event);
}
@@ -433,27 +420,19 @@ document.getElementById("chat-input-row").classList.add("chat-input-positioned")
//------------------------------------------------
const chatTextArea = document.getElementById("chat-input").querySelector("textarea");
-function respondToChatInputVisibility(element, callback) {
- var options = {
- root: document.documentElement,
- };
-
- var observer = new IntersectionObserver((entries, observer) => {
+function focusOnVisible(element) {
+ var observer = new IntersectionObserver((entries) => {
entries.forEach(entry => {
- callback(entry.intersectionRatio > 0);
+ if (entry.intersectionRatio > 0) {
+ element.focus();
+ }
});
- }, options);
+ }, { root: document.documentElement });
observer.observe(element);
}
-function handleChatInputVisibilityChange(isVisible) {
- if (isVisible) {
- chatTextArea.focus();
- }
-}
-
-respondToChatInputVisibility(chatTextArea, handleChatInputVisibilityChange);
+focusOnVisible(chatTextArea);
//------------------------------------------------
// Show enlarged character picture when the profile
@@ -463,8 +442,7 @@ let bigPictureVisible = false;
function addBigPicture() {
var imgElement = document.createElement("img");
- var timestamp = new Date().getTime();
- imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
+ imgElement.src = getProfilePictureUrl();
imgElement.classList.add("bigProfilePicture");
imgElement.addEventListener("load", function () {
this.style.visibility = "visible";
@@ -478,9 +456,8 @@ function addBigPicture() {
}
function deleteBigPicture() {
- var bigProfilePictures = document.querySelectorAll(".bigProfilePicture");
- bigProfilePictures.forEach(function (element) {
- element.parentNode.removeChild(element);
+ document.querySelectorAll(".bigProfilePicture").forEach(function (element) {
+ element.remove();
});
}
@@ -494,44 +471,11 @@ function toggleBigPicture() {
}
}
-//------------------------------------------------
-// Handle the chat input box growth
-//------------------------------------------------
-
-// Cache DOM elements
-const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
-const chatInput = document.querySelector("#chat-input textarea");
-
-// Variables to store current dimensions
-let currentChatInputHeight = chatInput.clientHeight;
-
//------------------------------------------------
// Focus on the rename text area when it becomes visible
//------------------------------------------------
const renameTextArea = document.getElementById("rename-row").querySelector("textarea");
-
-function respondToRenameVisibility(element, callback) {
- var options = {
- root: document.documentElement,
- };
-
- var observer = new IntersectionObserver((entries, observer) => {
- entries.forEach(entry => {
- callback(entry.intersectionRatio > 0);
- });
- }, options);
-
- observer.observe(element);
-}
-
-
-function handleVisibilityChange(isVisible) {
- if (isVisible) {
- renameTextArea.focus();
- }
-}
-
-respondToRenameVisibility(renameTextArea, handleVisibilityChange);
+focusOnVisible(renameTextArea);
//------------------------------------------------
// Adjust the chat tab margin if no extension UI
@@ -737,21 +681,21 @@ function handleIndividualSidebarClose(event) {
// Close navigation bar if click is outside and it is open
if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) {
- toggleSidebar(headerBar, navigationToggle, true);
+ toggleSidebar(headerBar, navigationToggle);
}
// Close past chats row if click is outside and it is open
if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) {
- toggleSidebar(pastChatsRow, pastChatsToggle, true);
+ toggleSidebar(pastChatsRow, pastChatsToggle);
}
// Close chat controls row if click is outside and it is open
if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) {
- toggleSidebar(chatControlsRow, chatControlsToggle, true);
+ toggleSidebar(chatControlsRow, chatControlsToggle);
}
}
-function toggleSidebar(sidebar, toggle, forceClose = false) {
+function toggleSidebar(sidebar, toggle) {
const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden");
const shouldClose = !isCurrentlyHidden;
@@ -776,11 +720,6 @@ function toggleSidebar(sidebar, toggle, forceClose = false) {
toggle.classList.toggle("chat-controls-open", !shouldClose);
toggle.innerHTML = shouldClose ? leftArrowSVG : rightArrowSVG;
}
-
- // Mobile handling
- if (isMobile()) {
- sidebar.classList.toggle("sidebar-shown", !shouldClose);
- }
}
// Function to check if the device is mobile
@@ -840,17 +779,17 @@ pastChatsToggle.addEventListener("click", () => {
const isCurrentlyOpen = !pastChatsRow.classList.contains("sidebar-hidden");
toggleSidebar(pastChatsRow, pastChatsToggle);
- // On desktop, open/close both sidebars at the same time
+ // On desktop, sync both sidebars together
if (!isMobile()) {
if (isCurrentlyOpen) {
// If we just closed the left sidebar, also close the right sidebar
if (!chatControlsRow.classList.contains("sidebar-hidden")) {
- toggleSidebar(chatControlsRow, chatControlsToggle, true);
+ toggleSidebar(chatControlsRow, chatControlsToggle);
}
} else {
// If we just opened the left sidebar, also open the right sidebar
if (chatControlsRow.classList.contains("sidebar-hidden")) {
- toggleSidebar(chatControlsRow, chatControlsToggle, false);
+ toggleSidebar(chatControlsRow, chatControlsToggle);
}
}
}
@@ -860,17 +799,17 @@ chatControlsToggle.addEventListener("click", () => {
const isCurrentlyOpen = !chatControlsRow.classList.contains("sidebar-hidden");
toggleSidebar(chatControlsRow, chatControlsToggle);
- // On desktop, open/close both sidebars at the same time
+ // On desktop, sync both sidebars together
if (!isMobile()) {
if (isCurrentlyOpen) {
// If we just closed the right sidebar, also close the left sidebar
if (!pastChatsRow.classList.contains("sidebar-hidden")) {
- toggleSidebar(pastChatsRow, pastChatsToggle, true);
+ toggleSidebar(pastChatsRow, pastChatsToggle);
}
} else {
// If we just opened the right sidebar, also open the left sidebar
if (pastChatsRow.classList.contains("sidebar-hidden")) {
- toggleSidebar(pastChatsRow, pastChatsToggle, false);
+ toggleSidebar(pastChatsRow, pastChatsToggle);
}
}
}
@@ -890,7 +829,7 @@ if (isMobile()) {
const textarea = document.querySelector("#chat-input textarea");
if (textarea) {
- // Simulate adding and removing a newline
+ // Force textarea height recalculation by simulating content change
textarea.value += "\n";
textarea.dispatchEvent(new Event("input", { bubbles: true }));
textarea.value = textarea.value.slice(0, -1);
diff --git a/js/save_files.js b/js/save_files.js
index bdb0e334..c3cbf9ff 100644
--- a/js/save_files.js
+++ b/js/save_files.js
@@ -1,10 +1,9 @@
// Functions for downloading JSON files
function getCurrentTimestamp() {
const now = new Date();
- const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds
+ const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert minutes to milliseconds
const localTime = new Date(now.getTime() - timezoneOffset);
- const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
- return formattedTimestamp;
+ return localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
}
function saveFile(contents, filename) {
@@ -18,23 +17,18 @@ function saveFile(contents, filename) {
}
function saveHistory(history, character, mode) {
- let path = null;
+ let path;
if (["chat", "chat-instruct"].includes(mode) && character && character.trim() !== "") {
path = `history_${character}_${getCurrentTimestamp()}.json`;
} else {
- try {
- path = `history_${mode}_${getCurrentTimestamp()}.json`;
- } catch (error) {
- path = `history_${getCurrentTimestamp()}.json`;
- }
+ path = `history_${mode || "unknown"}_${getCurrentTimestamp()}.json`;
}
+
saveFile(history, path);
}
function saveSession(session) {
- let path = null;
-
- path = `session_${getCurrentTimestamp()}.json`;
+ const path = `session_${getCurrentTimestamp()}.json`;
saveFile(session, path);
}
diff --git a/js/show_controls.js b/js/show_controls.js
index ff513395..d5642dc4 100644
--- a/js/show_controls.js
+++ b/js/show_controls.js
@@ -1,13 +1,11 @@
-const chatParent = document.querySelector(".chat-parent");
-
function toggle_controls(value) {
+ const navToggle = document.getElementById("navigation-toggle");
+ const pastChatsToggle = document.getElementById("past-chats-toggle");
const extensions = document.querySelector("#extensions");
+ const galleryExtension = document.getElementById("gallery-extension");
if (value) {
// SHOW MODE: Click toggles to show hidden sidebars
- const navToggle = document.getElementById("navigation-toggle");
- const pastChatsToggle = document.getElementById("past-chats-toggle");
-
if (navToggle && document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
navToggle.click();
}
@@ -19,17 +17,11 @@ function toggle_controls(value) {
if (extensions) {
extensions.style.display = "inherit";
}
-
- let gallery_element = document.getElementById("gallery-extension");
- if (gallery_element) {
- gallery_element.style.display = "block";
+ if (galleryExtension) {
+ galleryExtension.style.display = "block";
}
-
} else {
// HIDE MODE: Click toggles to hide visible sidebars
- const navToggle = document.getElementById("navigation-toggle");
- const pastChatsToggle = document.getElementById("past-chats-toggle");
-
if (navToggle && !document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
navToggle.click();
}
@@ -41,5 +33,8 @@ function toggle_controls(value) {
if (extensions) {
extensions.style.display = "none";
}
+ if (galleryExtension) {
+ galleryExtension.style.display = "none";
+ }
}
}
diff --git a/js/switch_tabs.js b/js/switch_tabs.js
index 36e5736b..a1b44ef3 100644
--- a/js/switch_tabs.js
+++ b/js/switch_tabs.js
@@ -2,17 +2,9 @@ function scrollToTop() {
window.scrollTo({ top: 0 });
}
-function findButtonsByText(buttonText) {
- const buttons = document.getElementsByTagName("button");
- const matchingButtons = [];
-
- for (let i = 0; i < buttons.length; i++) {
- if (buttons[i].textContent.trim() === buttonText) {
- matchingButtons.push(buttons[i]);
- }
- }
-
- return matchingButtons;
+function findButtonsByText(buttonText, container = document) {
+ return Array.from(container.getElementsByTagName("button"))
+ .filter(btn => btn.textContent.trim() === buttonText);
}
function switch_to_chat() {
@@ -39,13 +31,9 @@ function switch_to_character() {
function switch_to_image_ai_generate() {
const container = document.querySelector("#image-ai-tab");
- const buttons = container.getElementsByTagName("button");
-
- for (let i = 0; i < buttons.length; i++) {
- if (buttons[i].textContent.trim() === "Generate") {
- buttons[i].click();
- break;
- }
+ const generateBtn = findButtonsByText("Generate", container)[0];
+ if (generateBtn) {
+ generateBtn.click();
}
scrollToTop();
diff --git a/js/update_big_picture.js b/js/update_big_picture.js
index ec51d63b..8f638c99 100644
--- a/js/update_big_picture.js
+++ b/js/update_big_picture.js
@@ -1,7 +1,6 @@
function updateBigPicture() {
var existingElement = document.querySelector(".bigProfilePicture");
if (existingElement) {
- var timestamp = new Date().getTime();
- existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
+ existingElement.src = getProfilePictureUrl();
}
}
diff --git a/modules/extensions.py b/modules/extensions.py
index 09db9f40..afe847f0 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -191,21 +191,19 @@ def _apply_custom_generate_reply():
def _apply_custom_css():
- all_css = ''
- for extension, _ in iterator():
- if hasattr(extension, 'custom_css'):
- all_css += getattr(extension, 'custom_css')()
-
- return all_css
+ return ''.join(
+ getattr(extension, 'custom_css')()
+ for extension, _ in iterator()
+ if hasattr(extension, 'custom_css')
+ )
def _apply_custom_js():
- all_js = ''
- for extension, _ in iterator():
- if hasattr(extension, 'custom_js'):
- all_js += getattr(extension, 'custom_js')()
-
- return all_js
+ return ''.join(
+ getattr(extension, 'custom_js')()
+ for extension, _ in iterator()
+ if hasattr(extension, 'custom_js')
+ )
def create_extensions_block():
From 71c1a52afe54ab599ab5849ae80f1d5a3a72fb5a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 30 Mar 2026 20:49:38 -0700
Subject: [PATCH 12/27] API: Implement echo + logprobs for /v1/completions
endpoint
---
modules/api/completions.py | 299 ++++++++++++++++++++++++++++++------
modules/exllamav3.py | 26 +++-
modules/llama_cpp_server.py | 39 ++++-
3 files changed, 309 insertions(+), 55 deletions(-)
diff --git a/modules/api/completions.py b/modules/api/completions.py
index 8948bb86..587ad6ea 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -39,6 +39,129 @@ def load_chat_template_file(filepath):
return text
+def _first_token_display_str(token_id, prompt, tokenizer):
+ """Return the display string for the first prompt token.
+
+ Returns empty string for BOS or tokens that don't appear at the start
+ of the prompt text, so they don't shift text_offset for subsequent tokens.
+ """
+ token_id = int(token_id)
+ bos_id = getattr(tokenizer, 'bos_token_id', None)
+ if bos_id is not None and token_id == bos_id:
+ return ""
+
+ import torch
+ tok = tokenizer.decode(torch.tensor([token_id]))
+ if not prompt.startswith(tok):
+ return ""
+
+ return tok
+
+
+def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
+ """Compute logprob entries for prompt tokens via a forward pass.
+
+ Returns a list of logprob entries in the standard format.
+ The first token gets a null entry (no conditioning context).
+
+ Supported for HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
+ via a single forward pass, and for llama.cpp via the server's
+ prompt_logprobs parameter. Returns [] for unsupported loaders.
+ """
+ if input_ids is None:
+ input_ids = encode(prompt) # (1, seq_len) tensor or array
+
+ token_ids = input_ids[0]
+ n_tokens = len(token_ids)
+
+ if n_tokens == 0:
+ return []
+
+ loader = shared.args.loader
+ model = shared.model
+
+ if loader == 'llama.cpp':
+ return model.get_prompt_logprob_entries(token_ids, max(logprobs_count, 1), prompt=prompt)
+
+ first_token_str = _first_token_display_str(token_ids[0], prompt, shared.tokenizer)
+
+ if n_tokens <= 1:
+ return [{"token": first_token_str, "null_logprob": True}]
+
+ import torch
+
+ if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
+ # Native ExLlamav3: call the underlying Model.forward() directly
+ input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+ with torch.no_grad():
+ logits = model.model.forward(
+ input_ids=input_ids_tensor,
+ params={
+ "attn_mode": "flash_attn",
+ "cache": model.cache,
+ "past_len": 0,
+ "batch_shape": (1, model.max_tokens),
+ }
+ ).float().cpu()
+
+ elif hasattr(model, 'forward'):
+ # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
+ input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+ if hasattr(model, 'device'):
+ input_ids_tensor = input_ids_tensor.to(model.device)
+ with torch.no_grad():
+ # Pass labels to ensure logits are returned for ALL positions,
+ # not just the last token (some HF wrappers like ExLlamav3_HF
+ # only compute the last-token logits when labels are absent).
+ outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor)
+ logits = outputs.logits.float().cpu()
+
+ else:
+ return []
+
+ entries = [{"token": first_token_str, "null_logprob": True}]
+
+ # Batch logsumexp and topk as single operations across all positions
+ # to avoid per-position kernel launch overhead.
+ prompt_logits = logits[0, :n_tokens - 1] # positions 0..n-2 predict tokens 1..n-1
+ k = min(logprobs_count, prompt_logits.shape[-1])
+ all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1)
+ all_lse = torch.logsumexp(prompt_logits, dim=-1)
+ all_top_log_probs = all_top_values - all_lse.unsqueeze(-1)
+
+ # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls
+ unique_ids = set(int(tid) for tid in token_ids[1:])
+ unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist())
+
+ decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids}
+
+ for i in range(1, n_tokens):
+ token_id = int(token_ids[i])
+ idx = i - 1
+ top_log_probs = all_top_log_probs[idx]
+ top_ids = all_top_indices[idx].tolist()
+ actual_token_str = decoded_strs[token_id]
+
+ # Build the top list with the actual prompt token guaranteed at front
+ if token_id in top_ids:
+ actual_lp = top_log_probs[top_ids.index(token_id)].item()
+ alternatives = [
+ {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+ for j in range(k) if top_ids[j] != token_id
+ ]
+ else:
+ actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
+ alternatives = [
+ {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+ for j in range(k - 1) # drop lowest to make room
+ ]
+
+ entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives}
+ entries.append(entry)
+
+ return entries
+
+
def _get_raw_logprob_entries(offset=0):
"""Get raw logprob entries from llama.cpp/ExLlamav3 backend, starting from offset.
@@ -65,6 +188,21 @@ def _parse_entry_top(entry):
return entry.get('top_logprobs', entry.get('top_probs', []))
+def _extract_sampled_token(entry, top):
+ """Get the actually sampled token and its logprob from a logprob entry.
+
+ Uses the entry-level token/logprob when available (the actually sampled
+ token), falling back to top[0] (highest-probability alternative) which
+ may differ with non-greedy sampling.
+ """
+ if 'token' in entry:
+ return entry['token'], entry.get('logprob', entry.get('prob', 0))
+
+ token_str = top[0].get('token', '')
+ token_logprob = top[0].get('logprob', top[0].get('prob', 0))
+ return token_str, token_logprob
+
+
def format_chat_logprobs(entries):
"""Format logprob entries into OpenAI chat completions logprobs format.
@@ -79,9 +217,7 @@ def format_chat_logprobs(entries):
if not top:
continue
- chosen = top[0]
- token_str = chosen.get('token', '')
- token_logprob = chosen.get('logprob', chosen.get('prob', 0))
+ token_str, token_logprob = _extract_sampled_token(entry, top)
top_list = []
for item in top:
@@ -118,13 +254,21 @@ def format_completion_logprobs(entries):
offset = 0
for entry in entries:
+ # Handle null logprob entries (first prompt token with echo)
+ if entry.get("null_logprob"):
+ token_str = entry.get("token", "")
+ tokens.append(token_str)
+ token_logprobs.append(None)
+ top_logprobs.append(None)
+ text_offset.append(offset)
+ offset += len(token_str)
+ continue
+
top = _parse_entry_top(entry)
if not top:
continue
- chosen = top[0]
- token_str = chosen.get('token', '')
- token_logprob = chosen.get('logprob', chosen.get('prob', 0))
+ token_str, token_logprob = _extract_sampled_token(entry, top)
tokens.append(token_str)
token_logprobs.append(token_logprob)
@@ -407,7 +551,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
})
max_tokens = generate_params['max_new_tokens']
- if max_tokens in [None, 0]:
+ if max_tokens is not None and max_tokens <= 0:
+ raise InvalidRequestError(message="max_tokens must be greater than 0.", param="max_tokens")
+
+ if max_tokens is None:
generate_params['max_new_tokens'] = 512
generate_params['auto_max_new_tokens'] = True
@@ -652,6 +799,15 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
# common params
generate_params = process_parameters(body, is_legacy=is_legacy)
max_tokens = generate_params['max_new_tokens']
+ if max_tokens is None:
+ generate_params['max_new_tokens'] = 512
+ generate_params['auto_max_new_tokens'] = True
+ max_tokens = 512
+ elif max_tokens < 0:
+ raise InvalidRequestError(message="max_tokens must be greater than or equal to 0.", param="max_tokens")
+ elif max_tokens == 0 and body.get('logprobs') is None:
+ raise InvalidRequestError(message="max_tokens is 0 but no logprobs parameter was specified.", param="max_tokens")
+
generate_params['stream'] = stream
if stop_event is not None:
generate_params['stop_event'] = stop_event
@@ -700,9 +856,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
prompt = decode(prompt)[0]
prefix = prompt if echo else ''
- token_count = len(encode(prompt)[0])
+ prompt_input_ids = encode(prompt)
+ token_count = len(prompt_input_ids[0])
total_prompt_token_count += token_count
+ # Compute prompt logprobs once per prompt (shared across n_completions)
+ logprobs_val = body.get('logprobs', None)
+ if echo and logprobs_val is not None and logprobs_val >= 0:
+ prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
+ else:
+ prompt_entries = None
+
original_seed = generate_params.get('seed', -1)
for _n in range(n_completions):
# Increment seed for each completion to ensure diversity (matches llama.cpp native behavior)
@@ -713,29 +877,41 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
logprob_proc.token_alternatives_history.clear()
# generate reply #######################################
- debug_msg({'prompt': prompt, 'generate_params': generate_params})
- generator = generate_reply(prompt, generate_params, is_chat=False)
- answer = ''
-
- for a in generator:
- answer = a
-
- completion_token_count = len(encode(answer)[0])
- total_completion_token_count += completion_token_count
- stop_reason = "stop"
- if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
- stop_reason = "length"
-
- if logprob_proc:
- all_entries = []
- for alt in logprob_proc.token_alternatives_history:
- all_entries.extend(_dict_to_logprob_entries(alt))
- completion_logprobs = format_completion_logprobs(all_entries)
- elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
- raw = getattr(shared.model, 'last_completion_probabilities', None)
- completion_logprobs = format_completion_logprobs(raw)
+ if max_tokens == 0:
+ answer = ''
+ completion_token_count = 0
+ stop_reason = "stop"
else:
- completion_logprobs = None
+ debug_msg({'prompt': prompt, 'generate_params': generate_params})
+ generator = generate_reply(prompt, generate_params, is_chat=False)
+ answer = ''
+
+ for a in generator:
+ answer = a
+
+ completion_token_count = len(encode(answer)[0])
+ stop_reason = "stop"
+ if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
+ stop_reason = "length"
+
+ total_completion_token_count += completion_token_count
+
+ if max_tokens == 0:
+ all_entries = []
+ else:
+ if logprob_proc:
+ all_entries = []
+ for alt in logprob_proc.token_alternatives_history:
+ all_entries.extend(_dict_to_logprob_entries(alt))
+ elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
+ all_entries = getattr(shared.model, 'last_completion_probabilities', None) or []
+ else:
+ all_entries = []
+
+ if prompt_entries:
+ all_entries = prompt_entries + all_entries
+
+ completion_logprobs = format_completion_logprobs(all_entries) if all_entries else None
respi = {
"index": choice_index,
@@ -775,7 +951,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
raise InvalidRequestError(message="API Batched generation not yet supported.", param=prompt_str)
prefix = prompt if echo else ''
- token_count = len(encode(prompt)[0])
+ prompt_input_ids = encode(prompt)
+ token_count = len(prompt_input_ids[0])
# Check if usage should be included in streaming chunks per OpenAI spec
stream_options = body.get('stream_options')
@@ -808,37 +985,57 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
return chunk
+ logprobs_val = body.get('logprobs', None)
+ if echo and logprobs_val is not None and logprobs_val >= 0:
+ prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
+ prompt_logprobs_formatted = format_completion_logprobs(prompt_entries) if prompt_entries else None
+ else:
+ prompt_logprobs_formatted = None
+
+ # Clear stale logprobs from any previous request before building the
+ # first chunk, so text_streaming_chunk doesn't pick up old data.
+ if hasattr(shared.model, 'last_completion_probabilities'):
+ shared.model.last_completion_probabilities = []
+ cmpl_logprobs_offset[0] = 0
+
chunk = text_streaming_chunk(prefix)
+ if prompt_logprobs_formatted is not None:
+ chunk[resp_list][0]["logprobs"] = prompt_logprobs_formatted
if include_usage:
chunk['usage'] = None
yield chunk
# generate reply #######################################
- debug_msg({'prompt': prompt, 'generate_params': generate_params})
- generator = generate_reply(prompt, generate_params, is_chat=False)
- answer = ''
- seen_content = ''
- completion_token_count = 0
+ if max_tokens == 0:
+ answer = ''
+ completion_token_count = 0
+ stop_reason = "stop"
+ else:
+ debug_msg({'prompt': prompt, 'generate_params': generate_params})
+ generator = generate_reply(prompt, generate_params, is_chat=False)
+ answer = ''
+ seen_content = ''
+ completion_token_count = 0
- for a in generator:
- answer = a
+ for a in generator:
+ answer = a
- len_seen = len(seen_content)
- new_content = answer[len_seen:]
+ len_seen = len(seen_content)
+ new_content = answer[len_seen:]
- if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
- continue
+ if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
+ continue
- seen_content = answer
- chunk = text_streaming_chunk(new_content)
- if include_usage:
- chunk['usage'] = None
- yield chunk
+ seen_content = answer
+ chunk = text_streaming_chunk(new_content)
+ if include_usage:
+ chunk['usage'] = None
+ yield chunk
- completion_token_count = len(encode(answer)[0])
- stop_reason = "stop"
- if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
- stop_reason = "length"
+ completion_token_count = len(encode(answer)[0])
+ stop_reason = "stop"
+ if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
+ stop_reason = "length"
chunk = text_streaming_chunk(suffix)
chunk[resp_list][0]["finish_reason"] = stop_reason
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index f873503a..3782a693 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -489,15 +489,35 @@ class Exllamav3Model:
return
id_to_piece = self.tokenizer.get_id_to_piece_list(True)
+ sampled_ids = result.get("token_ids") # (batch, seq_len) - actually sampled tokens
+ sampled_probs = result.get("token_probs") # (batch, seq_len) - their probabilities
+
+ def _piece(tid):
+ s = id_to_piece[tid] if tid < len(id_to_piece) else f"<{tid}>"
+ return s.replace('\u2581', ' ')
+
+ def _logprob(prob):
+ return math.log(prob) if prob > 0 else float("-inf")
+
# top_k_tokens shape: (batch, seq_len, k), top_k_probs same
for seq_idx in range(top_k_tokens.shape[1]):
entry = {"top_logprobs": []}
for k_idx in range(top_k_tokens.shape[2]):
token_id = top_k_tokens[0, seq_idx, k_idx].item()
prob = top_k_probs[0, seq_idx, k_idx].item()
- token_str = id_to_piece[token_id] if token_id < len(id_to_piece) else f"<{token_id}>"
- logprob = math.log(prob) if prob > 0 else float("-inf")
- entry["top_logprobs"].append({"token": token_str, "logprob": logprob})
+ entry["top_logprobs"].append({"token": _piece(token_id), "logprob": _logprob(prob)})
+
+ # Record the actually sampled token at the entry level so
+ # format_completion_logprobs uses it instead of top_logprobs[0]
+ # (they differ with non-greedy sampling).
+ if sampled_ids is not None:
+ sid = sampled_ids[0, seq_idx].item()
+ entry["token"] = _piece(sid)
+ if sampled_probs is not None:
+ entry["logprob"] = _logprob(sampled_probs[0, seq_idx].item())
+ else:
+ entry["logprob"] = None
+
self.last_completion_probabilities.append(entry)
def generate(self, prompt, state):
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index fa968be1..34080466 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -310,8 +310,45 @@ class LlamaServer:
else:
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
+ def get_prompt_logprob_entries(self, token_ids, n_probs=5, prompt=""):
+ """Get logprob entries for prompt tokens via a single n_predict=0 request.
+
+ Requires llama.cpp server with prompt_logprobs support.
+ Returns entries in the standard format for format_completion_logprobs().
+ """
+ token_ids_list = token_ids.tolist() if hasattr(token_ids, 'tolist') else list(token_ids)
+
+ url = f"http://127.0.0.1:{self.port}/completion"
+ payload = {
+ "prompt": token_ids_list,
+ "n_predict": 0,
+ "n_probs": n_probs,
+ "prompt_logprobs": True,
+ "stream": False,
+ "cache_prompt": False,
+ }
+
+ response = self.session.post(url, json=payload)
+ result = response.json()
+
+ prompt_probs = result.get("prompt_probabilities", [])
+ if not prompt_probs:
+ return []
+
+ # Null first token (no conditioning context); use empty string for BOS
+ # or tokens that don't appear at the start of the prompt text.
+ first_token_str = self.decode([token_ids_list[0]])
+ if self.bos_token and first_token_str == self.bos_token:
+ first_token_str = ""
+ elif not prompt.startswith(first_token_str):
+ first_token_str = ""
+
+ entries = [{"token": first_token_str, "null_logprob": True}]
+ entries.extend(prompt_probs)
+ return entries
+
def _get_vocabulary_size(self):
- """Get and store the model's maximum context length."""
+ """Get and store the model's vocabulary size."""
url = f"http://127.0.0.1:{self.port}/v1/models"
response = self.session.get(url).json()
From 328534b762f22c82b09babf6b04e289eab4a7fde Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 12:51:07 -0700
Subject: [PATCH 13/27] Update llama.cpp
---
requirements/full/requirements.txt | 8 ++++----
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 4 ++--
requirements/full/requirements_cpu_only.txt | 8 ++++----
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_amd.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 2 +-
requirements/portable/requirements_apple_silicon.txt | 2 +-
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cuda131.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
12 files changed, 26 insertions(+), 26 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 6e11dd2f..57991c9a 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index c964eff6..bb47ea4b 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b1dd6a4f..5750b109 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 4d03d280..d8302d3d 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 9d41d069..d3a5c008 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,7 +37,7 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index ff80b6c8..1180b42d 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 318044da..57aa6262 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 1676bffb..894c9199 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 27fc2da8..32b9727f 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 0bbdd30a..73b72832 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index c3ae3c57..ad96bbe2 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index e646c04c..a5df3ad4 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From 4073164be0b305d8ac4a01d4259448370d009a99 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 19:08:37 -0700
Subject: [PATCH 14/27] Fix ExLlamav3 OOM on prompt logprobs and qwen3_5_moe HF
compat
---
modules/api/completions.py | 13 +++++--------
modules/exllamav3.py | 33 ++++-----------------------------
modules/exllamav3_hf.py | 32 ++++++++------------------------
3 files changed, 17 insertions(+), 61 deletions(-)
diff --git a/modules/api/completions.py b/modules/api/completions.py
index 587ad6ea..a15e1f86 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -91,17 +91,14 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
import torch
if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
- # Native ExLlamav3: call the underlying Model.forward() directly
+ # Native ExLlamav3: call the underlying Model.forward() in chunks
+ # to avoid OOM from giant logits tensors (seq_len * vocab_size * 4 bytes)
input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+ input_ids_tensor = input_ids_tensor.view(-1).cpu()
with torch.no_grad():
logits = model.model.forward(
- input_ids=input_ids_tensor,
- params={
- "attn_mode": "flash_attn",
- "cache": model.cache,
- "past_len": 0,
- "batch_shape": (1, model.max_tokens),
- }
+ input_ids=input_ids_tensor.view(1, -1),
+ params={"attn_mode": "flash_attn_nc"}
).float().cpu()
elif hasattr(model, 'forward'):
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 3782a693..7556a908 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -530,39 +530,14 @@ class Exllamav3Model:
def get_logits(self, token_ids, **kwargs):
"""
Process a batch of token_ids and return the logits for the last token.
- This will reset and overwrite the model's cache.
+ Uses flash_attn_nc (no cache) for correct results with recurrent models.
"""
- # Initialize a single params dictionary that will be updated in-place
- params = {
- "cache": self.cache,
- "reconstruct": False,
- "attn_mode": "flash_attn",
- "batch_shape": (1, self.max_tokens),
- "past_len": 0
- }
- params.update(kwargs)
-
- # Process prefix tokens to fill the cache and generate recurrent state
- if token_ids.shape[-1] > 1:
- prefix_ids = token_ids[:, :-1]
-
- # This forward call updates the 'params' dict with the recurrent state
- self.model.forward(
- input_ids=prefix_ids,
- params=params
- )
-
- # Update past_len for the next call
- params["past_len"] = prefix_ids.shape[-1]
-
- # Process the last token, now using the state-filled 'params' dict
- last_token_ids = token_ids[:, -1:]
logits = self.model.forward(
- input_ids=last_token_ids,
- params=params
+ input_ids=token_ids,
+ params={"attn_mode": "flash_attn_nc"}
)
- return logits.float().cpu()
+ return logits[:, -1:, :].float().cpu()
def encode(self, string, **kwargs):
add_bos = kwargs.pop('add_bos', True)
diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index e0ad5002..5e634e22 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -26,6 +26,9 @@ except Exception:
class Exllamav3HF(PreTrainedModel, GenerationMixin):
def __init__(self, model_dir):
hf_config = PretrainedConfig.from_pretrained(model_dir)
+ # Ensure text_config is a proper object, not a dict (fixes qwen3_5_moe + transformers compat)
+ if isinstance(getattr(hf_config, 'text_config', None), dict):
+ hf_config.text_config = PretrainedConfig(**hf_config.text_config)
super().__init__(hf_config)
exl3_config = Config.from_directory(model_dir)
@@ -199,30 +202,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
}
).to(input_ids.device).float()
else:
- # Labels path: use cache for cross-chunk attention.
- tokens_to_process = seq_tensor
- all_logits = None
- current_len = 0
-
- for i in range(0, tokens_to_process.shape[0], max_chunk_size):
- chunk = tokens_to_process[i:i + max_chunk_size]
- chunk_logits = self.ex_model.forward(
- input_ids=chunk.view(1, -1),
- params={
- "attn_mode": "flash_attn",
- "cache": ex_cache,
- "past_len": current_len,
- "batch_shape": (1, self.max_tokens),
- }
- ).float()
- current_len += chunk.shape[0]
-
- if all_logits is None:
- all_logits = chunk_logits
- else:
- all_logits = torch.cat([all_logits, chunk_logits], dim=1)
-
- logits = all_logits
+ # Labels path: single pass without cache for correct logits
+ logits = self.ex_model.forward(
+ input_ids=seq_tensor.view(1, -1),
+ params={"attn_mode": "flash_attn_nc"}
+ ).float().cpu()
if is_negative:
self.past_seq_negative = seq_tensor
From a32ce254f275efe473d6624995957b3b6bd51aa1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Apr 2026 20:28:44 -0700
Subject: [PATCH 15/27] Don't pass torch_dtype to transformers, autodetect from
model config
---
modules/transformers_loader.py | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index 7f521b8c..5964f012 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -109,7 +109,6 @@ def load_model_HF(model_name):
params = {
'low_cpu_mem_usage': True,
'attn_implementation': shared.args.attn_implementation,
- 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.original_args.trust_remote_code:
@@ -120,6 +119,17 @@ def load_model_HF(model_name):
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code)
+ # Determine torch_dtype: respect --bf16 flag, otherwise autodetect
+ # from model config, but never allow float32.
+ if shared.args.bf16:
+ params['torch_dtype'] = torch.bfloat16
+ else:
+ dtype = getattr(config, 'torch_dtype', None) or getattr(getattr(config, 'text_config', None), 'torch_dtype', None)
+ if dtype in (torch.float16, torch.bfloat16):
+ params['torch_dtype'] = dtype
+ else:
+ params['torch_dtype'] = torch.float16
+
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:
From c10c6e87ae0b0085b36e7e13269461744ce04ff6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 07:17:27 -0700
Subject: [PATCH 16/27] API: Add token ids to logprobs output
---
modules/api/completions.py | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/modules/api/completions.py b/modules/api/completions.py
index a15e1f86..453fa07b 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -143,17 +143,17 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
if token_id in top_ids:
actual_lp = top_log_probs[top_ids.index(token_id)].item()
alternatives = [
- {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+ {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
for j in range(k) if top_ids[j] != token_id
]
else:
actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
alternatives = [
- {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()}
+ {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
for j in range(k - 1) # drop lowest to make room
]
- entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives}
+ entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives}
entries.append(entry)
return entries
@@ -239,7 +239,7 @@ def format_chat_logprobs(entries):
def format_completion_logprobs(entries):
"""Format logprob entries into OpenAI completions logprobs format.
- Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "text_offset"}
+ Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "top_logprobs_ids": [{token_id: prob}], "text_offset"}
"""
if not entries:
return None
@@ -247,6 +247,7 @@ def format_completion_logprobs(entries):
tokens = []
token_logprobs = []
top_logprobs = []
+ top_logprobs_ids = []
text_offset = []
offset = 0
@@ -257,6 +258,7 @@ def format_completion_logprobs(entries):
tokens.append(token_str)
token_logprobs.append(None)
top_logprobs.append(None)
+ top_logprobs_ids.append(None)
text_offset.append(offset)
offset += len(token_str)
continue
@@ -273,21 +275,28 @@ def format_completion_logprobs(entries):
offset += len(token_str)
top_dict = {}
+ top_dict_ids = {}
for item in top:
t = item.get('token', '')
lp = item.get('logprob', item.get('prob', 0))
top_dict[t] = lp
+ if 'token_id' in item:
+ top_dict_ids[item['token_id']] = lp
top_logprobs.append(top_dict)
+ top_logprobs_ids.append(top_dict_ids if top_dict_ids else None)
if not tokens:
return None
- return {
+ result = {
"tokens": tokens,
"token_logprobs": token_logprobs,
"top_logprobs": top_logprobs,
"text_offset": text_offset
}
+ if any(x is not None for x in top_logprobs_ids):
+ result["top_logprobs_ids"] = top_logprobs_ids
+ return result
def process_parameters(body, is_legacy=False):
From ea1f8c71f2e92dc9ae230b943c605e43ff5c633c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:30:59 -0300
Subject: [PATCH 17/27] API: Optimize prompt logprobs and refactor ExLlamav3
forward pass
---
modules/api/completions.py | 69 ++++++++++++++++++++++++--------------
modules/exllamav3.py | 14 ++++++++
2 files changed, 58 insertions(+), 25 deletions(-)
diff --git a/modules/api/completions.py b/modules/api/completions.py
index 453fa07b..4eb8fdad 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -90,16 +90,8 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
import torch
- if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'):
- # Native ExLlamav3: call the underlying Model.forward() in chunks
- # to avoid OOM from giant logits tensors (seq_len * vocab_size * 4 bytes)
- input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
- input_ids_tensor = input_ids_tensor.view(-1).cpu()
- with torch.no_grad():
- logits = model.model.forward(
- input_ids=input_ids_tensor.view(1, -1),
- params={"attn_mode": "flash_attn_nc"}
- ).float().cpu()
+ if hasattr(model, 'get_prompt_logits'):
+ logits = model.get_prompt_logits(input_ids)
elif hasattr(model, 'forward'):
# HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
@@ -111,26 +103,54 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
# not just the last token (some HF wrappers like ExLlamav3_HF
# only compute the last-token logits when labels are absent).
outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor)
- logits = outputs.logits.float().cpu()
+ logits = outputs.logits # keep on GPU, (1, seq_len, vocab) in model dtype
+ del outputs
else:
return []
entries = [{"token": first_token_str, "null_logprob": True}]
- # Batch logsumexp and topk as single operations across all positions
- # to avoid per-position kernel launch overhead.
- prompt_logits = logits[0, :n_tokens - 1] # positions 0..n-2 predict tokens 1..n-1
- k = min(logprobs_count, prompt_logits.shape[-1])
- all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1)
- all_lse = torch.logsumexp(prompt_logits, dim=-1)
- all_top_log_probs = all_top_values - all_lse.unsqueeze(-1)
-
- # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls
+ logprobs_count = max(logprobs_count, 1)
+ k = min(logprobs_count, logits.shape[-1])
+ chunk_size = 2048
unique_ids = set(int(tid) for tid in token_ids[1:])
- unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist())
- decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids}
+ # Process logits in chunks on GPU, only move top-K results to CPU
+ all_top_log_probs_list = []
+ all_top_indices_list = []
+ all_actual_lps = []
+
+ for start in range(0, n_tokens - 1, chunk_size):
+ end = min(start + chunk_size, n_tokens - 1)
+ chunk_logits = logits[0, start:end].float() # (chunk, vocab) on GPU
+ chunk_lse = torch.logsumexp(chunk_logits, dim=-1)
+ chunk_top_values, chunk_top_indices = torch.topk(chunk_logits, k=k, dim=-1)
+ chunk_top_log_probs = chunk_top_values - chunk_lse.unsqueeze(-1)
+
+ # Compute logprob for actual next tokens in this chunk
+ chunk_top_sets = [set(chunk_top_indices[j].tolist()) for j in range(end - start)]
+ for j in range(end - start):
+ actual_tid = int(token_ids[start + j + 1])
+ if actual_tid not in chunk_top_sets[j]:
+ all_actual_lps.append((chunk_logits[j, actual_tid] - chunk_lse[j]).item())
+ else:
+ all_actual_lps.append(None) # will use top_log_probs
+
+ all_top_log_probs_list.append(chunk_top_log_probs.cpu())
+ all_top_indices_list.append(chunk_top_indices.cpu())
+ unique_ids.update(int(tid) for tid in chunk_top_indices.flatten().tolist())
+ del chunk_logits, chunk_lse, chunk_top_values
+
+ del logits
+ torch.cuda.empty_cache()
+
+ all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0)
+ all_top_indices = torch.cat(all_top_indices_list, dim=0)
+
+ unique_ids_list = sorted(unique_ids)
+ decoded_list = shared.tokenizer.batch_decode([[tid] for tid in unique_ids_list]) if hasattr(shared.tokenizer, 'batch_decode') else [shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids_list]
+ decoded_strs = dict(zip(unique_ids_list, decoded_list))
for i in range(1, n_tokens):
token_id = int(token_ids[i])
@@ -139,7 +159,6 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
top_ids = all_top_indices[idx].tolist()
actual_token_str = decoded_strs[token_id]
- # Build the top list with the actual prompt token guaranteed at front
if token_id in top_ids:
actual_lp = top_log_probs[top_ids.index(token_id)].item()
alternatives = [
@@ -147,10 +166,10 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
for j in range(k) if top_ids[j] != token_id
]
else:
- actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item()
+ actual_lp = all_actual_lps[idx]
alternatives = [
{"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
- for j in range(k - 1) # drop lowest to make room
+ for j in range(k - 1)
]
entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives}
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 7556a908..e1efbfeb 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -527,6 +527,20 @@ class Exllamav3Model:
return output
+ def get_prompt_logits(self, input_ids):
+ """Return logits for all positions via a single no-cache forward pass.
+
+ Used by prompt logprobs computation. Returns (1, seq_len, vocab) on CPU in float32.
+ """
+ import torch
+ input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
+ input_ids_tensor = input_ids_tensor.view(1, -1).cpu()
+ with torch.no_grad():
+ return self.model.forward(
+ input_ids=input_ids_tensor,
+ params={"attn_mode": "flash_attn_nc"}
+ ).cpu().float()
+
def get_logits(self, token_ids, **kwargs):
"""
Process a batch of token_ids and return the logits for the last token.
From c50e17bdbe1da850189188afaf0682a952efa0d1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 14:49:31 -0300
Subject: [PATCH 18/27] Add dedicated ik portable requirements files and remove
macOS ik builds
---
.github/workflows/build-everything-tgw.yml | 7 ---
.../build-portable-release-ik-cuda.yml | 9 ++--
.../workflows/build-portable-release-ik.yml | 44 +++----------------
requirements/portable/requirements_ik.txt | 27 ++++++++++++
.../portable/requirements_ik_cpu_only.txt | 27 ++++++++++++
.../portable/requirements_ik_cuda131.txt | 27 ++++++++++++
6 files changed, 91 insertions(+), 50 deletions(-)
create mode 100644 requirements/portable/requirements_ik.txt
create mode 100644 requirements/portable/requirements_ik_cpu_only.txt
create mode 100644 requirements/portable/requirements_ik_cuda131.txt
diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 4de591f4..40d9db5d 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -96,10 +96,3 @@ jobs:
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
-
- build_release_ik_macos:
- name: ik macOS
- uses: ./.github/workflows/build-portable-release-ik.yml
- with:
- version: ${{ inputs.version }}
- config: 'os:macos-14'
diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml
index 40b4b92f..331a7653 100644
--- a/.github/workflows/build-portable-release-ik-cuda.yml
+++ b/.github/workflows/build-portable-release-ik-cuda.yml
@@ -138,14 +138,13 @@ jobs:
# 3. Prepare requirements file based on CUDA version
cd "text-generation-webui-${VERSION_CLEAN}"
if [[ "$CUDA_VERSION" == "13.1" ]]; then
- REQ_FILE="requirements/portable/requirements_cuda131.txt"
+ REQ_FILE="requirements/portable/requirements_ik_cuda131.txt"
else
- REQ_FILE="requirements/portable/requirements.txt"
+ REQ_FILE="requirements/portable/requirements_ik.txt"
fi
- # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
- sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
- sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true
+ # 4. Inject --ik into start scripts
+ sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
# 5. Install packages
echo "Installing Python packages from $REQ_FILE..."
diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml
index afb2e763..bf54eb0e 100644
--- a/.github/workflows/build-portable-release-ik.yml
+++ b/.github/workflows/build-portable-release-ik.yml
@@ -1,4 +1,4 @@
-name: Build ik CPU and macOS
+name: Build ik CPU
on:
workflow_dispatch:
@@ -57,7 +57,7 @@ jobs:
id: set-matrix
run: |
$matrix = @{
- 'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
+ 'os' = @('ubuntu-22.04', 'windows-2022')
'pyver' = @("3.13")
}
@@ -110,7 +110,6 @@ jobs:
# Define common variables
VERSION="${{ inputs.version }}"
- OS_TYPE="${{ matrix.os }}"
# 1. Set platform-specific variables
if [[ "$RUNNER_OS" == "Windows" ]]; then
@@ -119,21 +118,7 @@ jobs:
PIP_PATH="portable_env/python.exe -m pip"
PACKAGES_PATH="portable_env/Lib/site-packages"
rm start_linux.sh start_macos.sh
- elif [[ "$RUNNER_OS" == "macOS" ]]; then
- if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
- PLATFORM="macos-x86_64"
- PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz"
- REQ_TYPE="apple_intel"
- else
- PLATFORM="macos-arm64"
- PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz"
- REQ_TYPE="apple_silicon"
- fi
- PIP_PATH="portable_env/bin/python -m pip"
- PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
- rm start_linux.sh start_windows.bat
else
- # Linux case
PLATFORM="linux-cpu"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
PIP_PATH="portable_env/bin/python -m pip"
@@ -148,30 +133,13 @@ jobs:
tar -xzf python-build.tar.gz
mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
- # 3. Prepare requirements file based on platform
+ # 3. Prepare requirements file
cd "text-generation-webui-${VERSION_CLEAN}"
-
- # Select requirements file based on platform
- if [[ "$RUNNER_OS" == "macOS" ]]; then
- if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
- REQ_FILE="requirements/portable/requirements_apple_intel.txt"
- else
- REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
- fi
- else
- REQ_FILE="requirements/portable/requirements_cpu_only.txt"
- fi
-
+ REQ_FILE="requirements/portable/requirements_ik_cpu_only.txt"
echo "Using requirements file: $REQ_FILE"
- # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
- if [[ "$RUNNER_OS" == "macOS" ]]; then
- sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
- sed -i '' 's/--portable/--portable --ik/g' start_macos.sh
- else
- sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
- sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
- fi
+ # 4. Inject --ik into start scripts
+ sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
# 5. Install packages
echo "Installing Python packages from $REQ_FILE..."
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
new file mode 100644
index 00000000..2fa037f7
--- /dev/null
+++ b/requirements/portable/requirements_ik.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# CUDA wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
new file mode 100644
index 00000000..b43b51c4
--- /dev/null
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# ik_llama.cpp (CPU only)
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
new file mode 100644
index 00000000..12767285
--- /dev/null
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -0,0 +1,27 @@
+audioop-lts<1.0; python_version >= "3.13"
+fastapi==0.112.4
+huggingface-hub==1.5.*
+jinja2==3.1.6
+markdown
+numpy==2.2.*
+pydantic==2.11.0
+pymupdf==1.27.*
+python-docx==1.1.2
+pyyaml
+requests
+rich
+trafilatura==2.0.0
+tqdm
+
+# Gradio
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+
+# API
+flask_cloudflared==0.0.15
+sse-starlette==1.6.5
+tiktoken
+
+# CUDA wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From 8f8b57a029715d07ab164aa22a779ea7ea4619f1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:54:20 -0700
Subject: [PATCH 19/27] Update exllamav3
---
requirements/full/requirements.txt | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 57991c9a..5591c9ca 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -44,7 +44,7 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
From 6a1f720c7bb9aef73c1c7c4e311460174c5255ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:58:20 -0700
Subject: [PATCH 20/27] Update transformers
---
requirements/full/requirements.txt | 2 +-
requirements/full/requirements_amd.txt | 2 +-
requirements/full/requirements_apple_intel.txt | 2 +-
requirements/full/requirements_apple_silicon.txt | 2 +-
requirements/full/requirements_cpu_only.txt | 2 +-
requirements/full/requirements_nowheels.txt | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 5591c9ca..30ee0316 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -25,7 +25,7 @@ sentencepiece
tensorboard
torchao==0.15.*
trafilatura==2.0.0
-transformers==5.3.*
+transformers==5.5.*
triton-windows==3.5.1.post24; platform_system == "Windows"
tqdm
wandb
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index bb47ea4b..9edc1d95 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -22,7 +22,7 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 5750b109..ff8687c1 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -22,7 +22,7 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d8302d3d..208632e8 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -22,7 +22,7 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index d3a5c008..4a7e5aaa 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -22,7 +22,7 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 052085cc..6200589e 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -22,7 +22,7 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
-transformers==5.3.*
+transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
From 468cb5cb87bf02f96efcd5acb1d1ac4b08c68273 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:59:28 -0700
Subject: [PATCH 21/27] Update accelerate
---
requirements/full/requirements.txt | 2 +-
requirements/full/requirements_amd.txt | 2 +-
requirements/full/requirements_apple_intel.txt | 2 +-
requirements/full/requirements_apple_silicon.txt | 2 +-
requirements/full/requirements_cpu_only.txt | 2 +-
requirements/full/requirements_nowheels.txt | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 30ee0316..e5bec6ec 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
bitsandbytes==0.49.*
datasets
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 9edc1d95..c6b5b2d0 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ff8687c1..ce671f0a 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 208632e8..d12d9f80 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 4a7e5aaa..4066b1af 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 6200589e..7173345a 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -1,4 +1,4 @@
-accelerate==1.12.*
+accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
From 80e81a54cacacbd8aa16ccf312ae0e574e4b416c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 11:11:44 -0700
Subject: [PATCH 22/27] Remove ik macOS wheels from full requirements
---
requirements/full/requirements_apple_intel.txt | 1 -
requirements/full/requirements_apple_silicon.txt | 1 -
2 files changed, 2 deletions(-)
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index ce671f0a..55a313e9 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -38,4 +38,3 @@ tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index d12d9f80..a6d34cbb 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -38,4 +38,3 @@ tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
From f6f8f14c8d0993327a2c86dfa3c976a7c1c569fc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 16:13:39 -0300
Subject: [PATCH 23/27] Security: Fix SSRF in superbooga extensions
---
extensions/superbooga/download_urls.py | 3 +++
extensions/superboogav2/download_urls.py | 2 ++
2 files changed, 5 insertions(+)
diff --git a/extensions/superbooga/download_urls.py b/extensions/superbooga/download_urls.py
index 424a9885..b28fea42 100644
--- a/extensions/superbooga/download_urls.py
+++ b/extensions/superbooga/download_urls.py
@@ -2,8 +2,11 @@ import concurrent.futures
import requests
+from modules.web_search import _validate_url
+
def download_single(url):
+ _validate_url(url)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
diff --git a/extensions/superboogav2/download_urls.py b/extensions/superboogav2/download_urls.py
index 5b5a2e17..4d8b98b1 100644
--- a/extensions/superboogav2/download_urls.py
+++ b/extensions/superboogav2/download_urls.py
@@ -5,12 +5,14 @@ import requests
from bs4 import BeautifulSoup
import extensions.superboogav2.parameters as parameters
+from modules.web_search import _validate_url
from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
+ _validate_url(url)
response = requests.get(url, timeout=5)
if response.status_code == 200:
return response.content
From 091037ec20743ac6c7bccb75b59743045a692c4a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 16:13:45 -0300
Subject: [PATCH 24/27] Fix top_logprobs_ids missing for llama.cpp loader
---
modules/api/completions.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/modules/api/completions.py b/modules/api/completions.py
index 4eb8fdad..98bcff47 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -299,8 +299,9 @@ def format_completion_logprobs(entries):
t = item.get('token', '')
lp = item.get('logprob', item.get('prob', 0))
top_dict[t] = lp
- if 'token_id' in item:
- top_dict_ids[item['token_id']] = lp
+ tid = item.get('token_id', item.get('id'))
+ if tid is not None:
+ top_dict_ids[tid] = lp
top_logprobs.append(top_dict)
top_logprobs_ids.append(top_dict_ids if top_dict_ids else None)
From a61bde509ff44a0f7662067bc94efd7f103f3162 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:30:02 -0700
Subject: [PATCH 25/27] Update llama.cpp
---
requirements/full/requirements.txt | 8 ++++----
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 2 +-
requirements/full/requirements_apple_silicon.txt | 2 +-
requirements/full/requirements_cpu_only.txt | 8 ++++----
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_amd.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 2 +-
requirements/portable/requirements_apple_silicon.txt | 2 +-
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cuda131.txt | 4 ++--
requirements/portable/requirements_ik.txt | 4 ++--
requirements/portable/requirements_ik_cpu_only.txt | 4 ++--
requirements/portable/requirements_ik_cuda131.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
15 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index e5bec6ec..f1a953a5 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index c6b5b2d0..211600e2 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 55a313e9..54d904dd 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index a6d34cbb..8829eb44 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 4066b1af..0a8cfac6 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,7 +37,7 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 1180b42d..607c642f 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 57aa6262..f0af64c8 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 894c9199..c5f351c5 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 32b9727f..5287aa25 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 73b72832..038318ab 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index ad96bbe2..d87c741e 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 2fa037f7..3e2471ae 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index b43b51c4..8272b9b6 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# ik_llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 12767285..98ef23d7 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index a5df3ad4..157ad313 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
From d84157403a1c8b65f8597302463e46c28a6659d1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:31:44 -0700
Subject: [PATCH 26/27] Update the custom gradio wheels
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 4 ++--
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_nowheels.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_amd.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 4 ++--
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cuda131.txt | 4 ++--
requirements/portable/requirements_ik.txt | 4 ++--
requirements/portable/requirements_ik_cpu_only.txt | 4 ++--
requirements/portable/requirements_ik_cuda131.txt | 4 ++--
requirements/portable/requirements_nowheels.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
17 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index f1a953a5..b38ae848 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 211600e2..7fb3a7d9 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 54d904dd..4a0f764c 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 8829eb44..942d5d71 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 0a8cfac6..6b61dca7 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 7173345a..a4d6cc97 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
wandb
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 607c642f..5aff54b2 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index f0af64c8..0771f53e 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index c5f351c5..427d59b2 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 5287aa25..c47a6ca1 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 038318ab..e491e357 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index d87c741e..5870983a 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt
index 3e2471ae..d11d337d 100644
--- a/requirements/portable/requirements_ik.txt
+++ b/requirements/portable/requirements_ik.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt
index 8272b9b6..c2b69e1c 100644
--- a/requirements/portable/requirements_ik_cpu_only.txt
+++ b/requirements/portable/requirements_ik_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt
index 98ef23d7..7f280930 100644
--- a/requirements/portable/requirements_ik_cuda131.txt
+++ b/requirements/portable/requirements_ik_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index e38140ce..322056be 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 157ad313..dfd52be5 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
From 7aab2fdf9aefb0f14fbf58e132a2a9a5850f8319 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:50:42 -0700
Subject: [PATCH 27/27] API: Improve cache clearing in logprobs
---
modules/api/completions.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/modules/api/completions.py b/modules/api/completions.py
index 98bcff47..f2282731 100644
--- a/modules/api/completions.py
+++ b/modules/api/completions.py
@@ -89,6 +89,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
return [{"token": first_token_str, "null_logprob": True}]
import torch
+ from modules.torch_utils import clear_torch_cache
if hasattr(model, 'get_prompt_logits'):
logits = model.get_prompt_logits(input_ids)
@@ -143,7 +144,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
del chunk_logits, chunk_lse, chunk_top_values
del logits
- torch.cuda.empty_cache()
+ clear_torch_cache()
all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0)
all_top_indices = torch.cat(all_top_indices_list, dim=0)