From 807be1183272fac409ce8f08609dbdd0d9f63362 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 24 Mar 2026 18:48:50 -0700 Subject: [PATCH 01/27] Remove obsolete models/config.yaml and related code --- docs/01 - Chat Tab.md | 2 +- docs/12 - OpenAI API.md | 2 +- modules/models.py | 1 - modules/models_settings.py | 9 +- modules/shared.py | 10 -- server.py | 5 - user_data/models/config.yaml | 203 ----------------------------------- 7 files changed, 4 insertions(+), 228 deletions(-) delete mode 100644 user_data/models/config.yaml diff --git a/docs/01 - Chat Tab.md b/docs/01 - Chat Tab.md index 5104895f..96b232fa 100644 --- a/docs/01 - Chat Tab.md +++ b/docs/01 - Chat Tab.md @@ -112,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template. -Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format. +Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any) from the model metadata (e.g. `tokenizer_config.json` or GGUF metadata), and will update the values under "Parameters" > "Instruction template" accordingly. You should check the model card on Hugging Face to see if you are using the correct prompt format. 
### Chat-instruct diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 2a7a7f69..0a076c35 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \ #### Chat completions -Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`. +Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be detected automatically from the model metadata. ```shell curl http://127.0.0.1:5000/v1/chat/completions \ diff --git a/modules/models.py b/modules/models.py index 1d139b89..b2665c6b 100644 --- a/modules/models.py +++ b/modules/models.py @@ -67,7 +67,6 @@ def load_model(model_name, loader=None): logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") logger.info(f"LOADER: \"{loader}\"") logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}") - logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"") return model, tokenizer diff --git a/modules/models_settings.py b/modules/models_settings.py index dcface71..eafa0581 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -23,14 +23,9 @@ def get_fallback_settings(): def get_model_metadata(model): model_path = resolve_model_path(model) - model_settings = {} - # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml - settings = shared.model_config - for pat in settings: - if re.match(pat.lower(), Path(model).name.lower()): - for k in settings[pat]: - model_settings[k] = settings[pat][k] + # Fallback settings + model_settings = get_fallback_settings() path = model_path / 'config.json' if path.exists(): diff --git a/modules/shared.py b/modules/shared.py index 16ccbe77..acb103b4 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ 
-454,17 +454,7 @@ def load_user_config(): args.loader = fix_loader_name(args.loader) -# Load model-specific settings -p = Path(f'{args.model_dir}/config.yaml') -if p.exists(): - model_config = yaml.safe_load(open(p, 'r').read()) -else: - model_config = {} -del p - - # Load custom model-specific settings user_config = load_user_config() -model_config = OrderedDict(model_config) user_config = OrderedDict(user_config) diff --git a/server.py b/server.py index d224909c..88936ca6 100644 --- a/server.py +++ b/server.py @@ -18,7 +18,6 @@ import modules.extensions as extensions_module from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model_if_idle from modules.models_settings import ( - get_fallback_settings, get_model_metadata, update_model_parameters ) @@ -271,10 +270,6 @@ if __name__ == "__main__": # Apply CLI overrides for image model settings (CLI flags take precedence over saved settings) shared.apply_image_model_cli_overrides() - # Fallback settings for models - shared.model_config['.*'] = get_fallback_settings() - shared.model_config.move_to_end('.*', last=False) # Move to the beginning - # Activate the extensions listed on settings.yaml extensions_module.available_extensions = utils.get_available_extensions() for extension in shared.settings['default_extensions']: diff --git a/user_data/models/config.yaml b/user_data/models/config.yaml deleted file mode 100644 index 038ebcf1..00000000 --- a/user_data/models/config.yaml +++ /dev/null @@ -1,203 +0,0 @@ -.*(llama|alpac|vicuna|guanaco|koala|llava|wizardlm|metharme|pygmalion-7b|pygmalion-2|mythalion|wizard-mega|openbuddy|vigogne|h2ogpt-research|manticore): - model_type: 'llama' -.*(opt-|opt_|opt1|opt3|optfor|galactica|galpaca|pygmalion-350m): - model_type: 'opt' -.*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1): - model_type: 'gptj' 
-.*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm): - model_type: 'gptneox' -.*bloom: - model_type: 'bloom' -.*gpt2: - model_type: 'gpt2' -.*falcon: - model_type: 'falcon' -.*mpt: - model_type: 'mpt' -.*(starcoder|starchat): - model_type: 'starcoder' -.*dolly-v2: - model_type: 'dollyv2' -.*replit: - model_type: 'replit' -.*(oasst|openassistant-|stablelm-7b-sft-v7-epoch-3): - instruction_template: 'Open Assistant' - skip_special_tokens: false -(?!.*galactica)(?!.*reward).*openassistant: - instruction_template: 'Open Assistant' - skip_special_tokens: false -.*galactica: - skip_special_tokens: false -.*dolly-v[0-9]-[0-9]*b: - instruction_template: 'Alpaca' - skip_special_tokens: false -.*alpaca-native-4bit: - instruction_template: 'Alpaca' -.*llava: - instruction_template: 'LLaVA' -.*llava.*1.5: - instruction_template: 'Vicuna-v1.1' -.*wizard.*mega: - instruction_template: 'Wizard-Mega' -.*starchat-beta: - instruction_template: 'Starchat-Beta' -(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna: - instruction_template: 'Vicuna-v0' -.*vicuna.*v0: - instruction_template: 'Vicuna-v0' -.*vicuna.*(1.1|1_1|1.3|1_3): - instruction_template: 'Vicuna-v1.1' -.*vicuna.*(1.5|1_5): - instruction_template: 'Vicuna-v1.1' -.*stable.*vicuna: - instruction_template: 'StableVicuna' -(?!.*chat).*chinese-vicuna: - instruction_template: 'Alpaca' -.*chinese-vicuna.*chat: - instruction_template: 'Chinese-Vicuna-Chat' -.*alpaca: - instruction_template: 'Alpaca' -.*koala: - instruction_template: 'Koala' -.*chatglm: - instruction_template: 'ChatGLM' -.*(metharme|pygmalion|mythalion): - instruction_template: 'Metharme' -.*raven: - instruction_template: 'RWKV-Raven' -.*moss-moon.*sft: - instruction_template: 'MOSS' -.*stablelm-tuned: - instruction_template: 'StableLM' -.*galactica.*finetuned: - instruction_template: 'Galactica Finetuned' -.*galactica.*-v2: - instruction_template: 
'Galactica v2' -(?!.*finetuned)(?!.*-v2).*galactica: - instruction_template: 'Galactica' -.*guanaco: - instruction_template: 'Guanaco non-chat' -.*baize: - instruction_template: 'Baize' -.*mpt-.*instruct: - instruction_template: 'Alpaca' -.*mpt-.*chat: - instruction_template: 'ChatML' -(?!.*-flan-)(?!.*-t5-).*lamini-: - instruction_template: 'Alpaca' -.*incite.*chat: - instruction_template: 'INCITE-Chat' -.*incite.*instruct: - instruction_template: 'INCITE-Instruct' -.*ziya-: - instruction_template: 'Ziya' -.*koalpaca: - instruction_template: 'KoAlpaca' -.*openbuddy: - instruction_template: 'OpenBuddy' -(?!.*chat).*vigogne: - instruction_template: 'Vigogne-Instruct' -.*vigogne.*chat: - instruction_template: 'Vigogne-Chat' -.*(llama-deus|supercot|llama-natural-instructions|open-llama-0.3t-7b-instruct-dolly-hhrlhf|open-llama-0.3t-7b-open-instruct): - instruction_template: 'Alpaca' -.*bactrian: - instruction_template: 'Bactrian' -.*(h2ogpt-oig-|h2ogpt-oasst1-|h2ogpt-research-oasst1-): - instruction_template: 'INCITE-Chat' -.*h2ogpt-gm-: - instruction_template: 'H2O-prompt_answer' -.*manticore: - instruction_template: 'Manticore Chat' -.*bluemoonrp-(30|13)b: - instruction_template: 'Bluemoon' -.*Nous-Hermes-13b: - instruction_template: 'Alpaca' -.*airoboros: - instruction_template: 'Vicuna-v1.1' -.*airoboros.*1.2: - instruction_template: 'Airoboros-v1.2' -.*alpa(cino|sta): - instruction_template: 'Alpaca' -.*hippogriff: - instruction_template: 'Hippogriff' -.*lazarus: - instruction_template: 'Alpaca' -.*guanaco-.*(7|13|33|65)b: - instruction_template: 'Vicuna-v0' -.*hypermantis: - instruction_template: 'Alpaca' -.*open-llama-.*-open-instruct: - instruction_template: 'Alpaca' -.*starcoder-gpteacher-code-instruct: - instruction_template: 'Alpaca' -.*tulu: - instruction_template: 'Tulu' -.*chronos: - instruction_template: 'Alpaca' -.*samantha: - instruction_template: 'Samantha' -.*wizardcoder: - instruction_template: 'Alpaca' -.*minotaur: - instruction_template: 
'Manticore Chat' -.*orca_mini: - instruction_template: 'Orca Mini' -.*(platypus|gplatty|superplatty): - instruction_template: 'Alpaca' -.*(openorca-platypus2): - instruction_template: 'OpenOrca-Platypus2' -.*longchat: - instruction_template: 'Vicuna-v1.1' -.*vicuna-33b: - instruction_template: 'Vicuna-v1.1' -.*redmond-hermes-coder: - instruction_template: 'Alpaca' -.*wizardcoder-15b: - instruction_template: 'Alpaca' -.*wizardlm: - instruction_template: 'Vicuna-v1.1' -.*godzilla: - instruction_template: 'Alpaca' -.*llama(-?)(2|v2).*chat: - instruction_template: 'Llama-v2' -.*newhope: - instruction_template: 'NewHope' -.*stablebeluga2: - instruction_template: 'StableBeluga2' -.*openchat: - instruction_template: 'OpenChat' -.*codellama.*instruct: - instruction_template: 'Llama-v2' -.*(mistral|mixtral).*instruct: - instruction_template: 'Mistral' -.*mistral.*openorca: - instruction_template: 'ChatML' -.*(WizardCoder-Python-34B-V1.0|Phind-CodeLlama-34B-v2|CodeBooga-34B-v0.1): - instruction_template: 'Alpaca' -.*orca-2-(13|7)b: - instruction_template: 'ChatML' -.*openhermes.*mistral: - instruction_template: 'ChatML' -.*Yi-34B-Chat: - instruction_template: 'ChatML' -(dolphin).*: - instruction_template: 'ChatML' -.*synthia: - instruction_template: 'Synthia' -.*(hercules|hyperion): - instruction_template: 'ChatML' -.*command-r: - instruction_template: 'Command-R' -.*xwin-lm-70b-v0.1: - instruction_template: 'Vicuna-v1.1' -.*platypus-yi-34b: - instruction_template: 'Vicuna-v1.1' -.*CausalLM-RP-34B: - instruction_template: 'ChatML' -34b-beta: - instruction_template: 'ChatML' -.*airoboros-3_1-yi-34b-200k: - instruction_template: 'Llama-v2' -.*chatqa: - instruction_template: 'NVIDIA-ChatQA' From d6f1485dd189494f6fbe5b6ea7ebd5cc0404233a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 24 Mar 2026 21:45:11 -0700 Subject: [PATCH 02/27] UI: Update the enable_thinking info message --- modules/ui_chat.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index f1dc7883..10d05f65 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -82,7 +82,7 @@ def create_ui(): gr.HTML("") shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.') gr.HTML("") From 368f37335f634ba001d00d2841902de85c7b48db Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 25 Mar 2026 06:37:45 -0700 Subject: [PATCH 03/27] Fix --idle-timeout issues with encode/decode and parallel generation --- modules/logits.py | 4 +--- modules/models.py | 15 ++++++++++++++- modules/text_generation.py | 18 +++++++++++++----- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/modules/logits.py b/modules/logits.py index 1f878f27..473f5890 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -4,7 +4,6 @@ import numpy as np from modules import models, shared from modules.logging_colors import logger -from modules.models import load_model from modules.text_generation import generate_reply from modules.utils import check_model_loaded @@ -12,8 +11,7 @@ global_scores = None def get_next_logits(*args, **kwargs): - if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: - shared.model, shared.tokenizer = load_model(shared.model_name) + models.load_model_if_idle_unloaded() needs_lock = not args[2] # use_samplers if needs_lock: diff --git a/modules/models.py b/modules/models.py index b2665c6b..61ca3838 100644 --- a/modules/models.py +++ 
b/modules/models.py @@ -1,4 +1,5 @@ import sys +import threading import time import modules.shared as shared @@ -7,6 +8,15 @@ from modules.models_settings import get_model_metadata from modules.utils import resolve_model_path last_generation_time = time.time() +active_generation_count = 0 +_generation_count_lock = threading.Lock() + + +def load_model_if_idle_unloaded(): + global last_generation_time + if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: + shared.model, shared.tokenizer = load_model(shared.model_name) + last_generation_time = time.time() def load_model(model_name, loader=None): @@ -158,7 +168,10 @@ def unload_model_if_idle(): while True: shared.generation_lock.acquire() try: - if time.time() - last_generation_time > shared.args.idle_timeout * 60: + with _generation_count_lock: + is_active = active_generation_count > 0 + + if not is_active and time.time() - last_generation_time > shared.args.idle_timeout * 60: if shared.model is not None: logger.info("Unloading the model for inactivity.") unload_model(keep_model_name=True) diff --git a/modules/text_generation.py b/modules/text_generation.py index f77be124..3a9ddab5 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -17,9 +17,7 @@ from modules.utils import check_model_loaded def generate_reply(*args, **kwargs): - if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: - from modules.models import load_model - shared.model, shared.tokenizer = load_model(shared.model_name) + models.load_model_if_idle_unloaded() state = args[1] if len(args) > 1 else kwargs.get('state', {}) use_parallel = ( @@ -31,10 +29,16 @@ def generate_reply(*args, **kwargs): if not use_parallel: shared.generation_lock.acquire() + with models._generation_count_lock: + models.active_generation_count += 1 + try: for result in _generate_reply(*args, **kwargs): yield result finally: + with models._generation_count_lock: 
+ models.active_generation_count -= 1 + models.last_generation_time = time.time() if not use_parallel: shared.generation_lock.release() @@ -126,7 +130,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None): if shared.tokenizer is None: - raise ValueError('No tokenizer is loaded') + models.load_model_if_idle_unloaded() + if shared.tokenizer is None: + raise ValueError('No tokenizer is loaded') # llama.cpp case if shared.model.__class__.__name__ == 'LlamaServer': @@ -176,7 +182,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt def decode(output_ids, skip_special_tokens=True): if shared.tokenizer is None: - raise ValueError('No tokenizer is loaded') + models.load_model_if_idle_unloaded() + if shared.tokenizer is None: + raise ValueError('No tokenizer is loaded') return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens) From e1541400219043f9b9cebf5f002b48251efc8bf9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 25 Mar 2026 07:21:02 -0700 Subject: [PATCH 04/27] Rename "truncation length" to "context length" in logs --- modules/api/models.py | 2 +- modules/models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/api/models.py b/modules/api/models.py index c879a860..b89397d3 100644 --- a/modules/api/models.py +++ b/modules/api/models.py @@ -68,7 +68,7 @@ def _load_model(data): if k in shared.settings: shared.settings[k] = settings[k] if k == 'truncation_length': - logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}") + logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}") elif k == 'instruction_template': logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}") diff --git a/modules/models.py b/modules/models.py index 
61ca3838..e997d2d8 100644 --- a/modules/models.py +++ b/modules/models.py @@ -76,7 +76,7 @@ def load_model(model_name, loader=None): logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") logger.info(f"LOADER: \"{loader}\"") - logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}") + logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}") return model, tokenizer From 4cbea02ed4e0dee2efd066ac48bcdf33631b9eca Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 26 Mar 2026 06:49:39 -0700 Subject: [PATCH 05/27] Add ik_llama.cpp support via `--ik` flag --- modules/llama_cpp_server.py | 37 +++++++++++++++++++++++++++++++++++++ modules/shared.py | 1 + 2 files changed, 38 insertions(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 2ae01ddc..9b9756a9 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -470,6 +470,10 @@ class LlamaServer: else: cmd.append(f"--{flag_item}") + # Patch flags for ik_llama.cpp compatibility + if shared.args.ik: + cmd = _patch_cmd_for_ik(cmd) + env = os.environ.copy() if os.name == 'posix': current_path = env.get('LD_LIBRARY_PATH', '') @@ -607,3 +611,36 @@ def filter_stderr_with_progress(process_stderr): process_stderr.close() except Exception: pass + + +def _patch_cmd_for_ik(cmd): + """ + Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents: + --no-webui → --webui none + --fit on → --fit (bare flag) + --fit off / --fit-ctx → (removed, along with the value) + --fit-target → --fit-margin + """ + patched = [] + i = 0 + while i < len(cmd): + arg = cmd[i] + + if arg == "--no-webui": + patched += ["--webui", "none"] + elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"): + val = cmd[i + 1] + i += 1 + if val == "on": + patched.append("--fit") + # "off" → drop entirely + elif arg == "--fit-ctx": + i += 1 # skip the value + elif arg == "--fit-target": + patched.append("--fit-margin") + else: + 
patched.append(arg) + + i += 1 + + return patched diff --git a/modules/shared.py b/modules/shared.py index acb103b4..c50736d7 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -110,6 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.') group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.') group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"') +group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside /lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') From bda95172bd6abecba165fc118f140cfc446f3c42 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Mar 2026 06:09:53 -0700 Subject: [PATCH 06/27] Fix stopping string detection for chromadb/context-1 --- modules/chat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index f8088e0f..edda11b0 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -671,7 +671,10 @@ def get_stopping_strings(state): # Handle GPT-OSS as a special case if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result: result.remove("<|end|>") - result.append("<|result|>") + if '<|result|>' in state['instruction_template_str']: + 
result.append("<|result|>") + elif '<|return|>' in state['instruction_template_str']: + result.append("<|return|>") result = list(set(result)) if shared.args.verbose: From 9dd04b86ce407507bcaf0862b97aadc64b6e62a6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Mar 2026 06:17:57 -0700 Subject: [PATCH 07/27] Suppress EOS token at logit level for ExLlamav3 when ban_eos_token is set --- modules/exllamav3.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 75c76c7c..f873503a 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -423,6 +423,15 @@ class Exllamav3Model: if logit_bias: filters.append(LogitBiasFilter(self.tokenizer, logit_bias)) + # Suppress EOS tokens via logit bias so they are never sampled + if state['ban_eos_token']: + eos_bias = {} + for eos_id in self.config.eos_token_id_list: + if eos_id is not None: + eos_bias[str(eos_id)] = float('-inf') + if eos_bias: + filters.append(LogitBiasFilter(self.tokenizer, eos_bias)) + # Logprobs support (OpenAI API) logprobs = state.get('logprobs', 0) or 0 return_top_tokens = logprobs if logprobs > 0 else 0 From 4979e87e48c78d5e3186e4d9b2fbc8b30e86164f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Mar 2026 11:49:47 -0300 Subject: [PATCH 08/27] Add ik_llama.cpp support via ik_llama_cpp_binaries package --- .github/workflows/build-everything-tgw.yml | 35 +++ .../build-portable-release-ik-cuda.yml | 179 +++++++++++++++ .../workflows/build-portable-release-ik.yml | 205 ++++++++++++++++++ modules/llama_cpp_server.py | 21 +- modules/loaders.py | 2 + modules/shared.py | 2 +- modules/ui_model_menu.py | 3 + requirements/full/requirements.txt | 6 +- requirements/full/requirements_amd.txt | 4 +- .../full/requirements_apple_intel.txt | 3 +- .../full/requirements_apple_silicon.txt | 3 +- requirements/full/requirements_cpu_only.txt | 6 +- 
requirements/portable/requirements.txt | 4 +- requirements/portable/requirements_amd.txt | 4 +- .../portable/requirements_apple_intel.txt | 2 +- .../portable/requirements_apple_silicon.txt | 2 +- .../portable/requirements_cpu_only.txt | 4 +- .../portable/requirements_cuda131.txt | 4 +- requirements/portable/requirements_vulkan.txt | 4 +- 19 files changed, 469 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/build-portable-release-ik-cuda.yml create mode 100644 .github/workflows/build-portable-release-ik.yml diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml index 9322f859..4de591f4 100644 --- a/.github/workflows/build-everything-tgw.yml +++ b/.github/workflows/build-everything-tgw.yml @@ -68,3 +68,38 @@ jobs: with: version: ${{ inputs.version }} config: 'os:macos-15-intel,macos-14' + + build_release_ik_cuda_windows: + name: ik CUDA Windows + uses: ./.github/workflows/build-portable-release-ik-cuda.yml + with: + version: ${{ inputs.version }} + config: 'os:windows-2022' + + build_release_ik_cuda_linux: + name: ik CUDA Linux + uses: ./.github/workflows/build-portable-release-ik-cuda.yml + with: + version: ${{ inputs.version }} + config: 'os:ubuntu-22.04' + + build_release_ik_cpu_windows: + name: ik CPU Windows + uses: ./.github/workflows/build-portable-release-ik.yml + with: + version: ${{ inputs.version }} + config: 'os:windows-2022' + + build_release_ik_cpu_linux: + name: ik CPU Linux + uses: ./.github/workflows/build-portable-release-ik.yml + with: + version: ${{ inputs.version }} + config: 'os:ubuntu-22.04' + + build_release_ik_macos: + name: ik macOS + uses: ./.github/workflows/build-portable-release-ik.yml + with: + version: ${{ inputs.version }} + config: 'os:macos-14' diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml new file mode 100644 index 00000000..40b4b92f --- /dev/null +++ 
b/.github/workflows/build-portable-release-ik-cuda.yml @@ -0,0 +1,179 @@ +name: Build ik CUDA + +on: + workflow_dispatch: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + workflow_call: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + env: + CONFIGIN: ${{ inputs.config }} + EXCLUDEIN: ${{ inputs.exclude }} + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('ubuntu-22.04', 'windows-2022') + 'pyver' = @("3.13") + 'cuda' = @("12.4", "13.1") + } + + if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} + + if ($env:EXCLUDEIN -ne 'None') { + $exclusions = @() + $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData + $matrix['exclude'] = $exclusions + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: ${{ 
matrix.os }} ${{ matrix.pyver }} CUDA ${{ matrix.cuda }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + defaults: + run: + shell: pwsh + env: + PCKGVER: ${{ inputs.version }} + + steps: + - uses: actions/checkout@v6 + with: + repository: 'oobabooga/text-generation-webui' + ref: ${{ inputs.version }} + submodules: 'recursive' + + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.pyver }} + + - name: Build Package + shell: bash + run: | + VERSION_CLEAN="${{ inputs.version }}" + VERSION_CLEAN="${VERSION_CLEAN#v}" + cd .. + cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}" + cd "text-generation-webui-${VERSION_CLEAN}" + + # Remove extensions that need additional requirements + allowed=("character_bias" "gallery" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf + + # Define common variables + CUDA_VERSION="${{ matrix.cuda }}" + VERSION="${{ inputs.version }}" + + # 1. Set platform-specific variables + if [[ "$RUNNER_OS" == "Windows" ]]; then + PLATFORM="windows" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" + PIP_PATH="portable_env/python.exe -m pip" + PACKAGES_PATH="portable_env/Lib/site-packages" + rm start_linux.sh start_macos.sh + else + PLATFORM="linux" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.13/site-packages" + rm start_macos.sh start_windows.bat + fi + + # 2. Download and extract Python + cd .. + echo "Downloading Python for $PLATFORM..." 
+ curl -L -o python-build.tar.gz "$PYTHON_URL" + tar -xzf python-build.tar.gz + mv python "text-generation-webui-${VERSION_CLEAN}/portable_env" + + # 3. Prepare requirements file based on CUDA version + cd "text-generation-webui-${VERSION_CLEAN}" + if [[ "$CUDA_VERSION" == "13.1" ]]; then + REQ_FILE="requirements/portable/requirements_cuda131.txt" + else + REQ_FILE="requirements/portable/requirements.txt" + fi + + # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts + sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" + sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true + + # 5. Install packages + echo "Installing Python packages from $REQ_FILE..." + $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE" + + # 6. Clean up + rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py + + # 7. Create archive + cd .. + if [[ "$RUNNER_OS" == "Windows" ]]; then + ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip" + echo "Creating archive: $ARCHIVE_NAME" + powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME" + else + ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz" + echo "Creating archive: $ARCHIVE_NAME" + tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}" + fi + + - name: Upload files to a GitHub release + id: upload-release + uses: svenstaro/upload-release-action@2.7.0 + continue-on-error: true + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: ../textgen-portable-ik-* + tag: ${{ inputs.version }} + file_glob: true + make_latest: false + overwrite: true diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml new file mode 100644 index 00000000..afb2e763 --- /dev/null +++ 
b/.github/workflows/build-portable-release-ik.yml @@ -0,0 +1,205 @@ +name: Build ik CPU and macOS + +on: + workflow_dispatch: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + workflow_call: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + env: + CONFIGIN: ${{ inputs.config }} + EXCLUDEIN: ${{ inputs.exclude }} + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14') + 'pyver' = @("3.13") + } + + if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} + + if ($env:EXCLUDEIN -ne 'None') { + $exclusions = @() + $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData + $matrix['exclude'] = $exclusions + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: ${{ matrix.os }} ${{ 
matrix.pyver }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + defaults: + run: + shell: pwsh + env: + PCKGVER: ${{ inputs.version }} + + steps: + - uses: actions/checkout@v6 + with: + repository: 'oobabooga/text-generation-webui' + ref: ${{ inputs.version }} + submodules: 'recursive' + + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.pyver }} + + - name: Build Package + shell: bash + run: | + VERSION_CLEAN="${{ inputs.version }}" + VERSION_CLEAN="${VERSION_CLEAN#v}" + cd .. + cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}" + cd "text-generation-webui-${VERSION_CLEAN}" + + # Remove extensions that need additional requirements + allowed=("character_bias" "gallery" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf + + # Define common variables + VERSION="${{ inputs.version }}" + OS_TYPE="${{ matrix.os }}" + + # 1. 
Set platform-specific variables + if [[ "$RUNNER_OS" == "Windows" ]]; then + PLATFORM="windows-cpu" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" + PIP_PATH="portable_env/python.exe -m pip" + PACKAGES_PATH="portable_env/Lib/site-packages" + rm start_linux.sh start_macos.sh + elif [[ "$RUNNER_OS" == "macOS" ]]; then + if [[ "$OS_TYPE" == "macos-15-intel" ]]; then + PLATFORM="macos-x86_64" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz" + REQ_TYPE="apple_intel" + else + PLATFORM="macos-arm64" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz" + REQ_TYPE="apple_silicon" + fi + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.13/site-packages" + rm start_linux.sh start_windows.bat + else + # Linux case + PLATFORM="linux-cpu" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.13/site-packages" + rm start_macos.sh start_windows.bat + fi + + # 2. Download and extract Python + echo "Downloading Python for $PLATFORM..." + cd .. + curl -L -o python-build.tar.gz "$PYTHON_URL" + tar -xzf python-build.tar.gz + mv python "text-generation-webui-${VERSION_CLEAN}/portable_env" + + # 3. 
Prepare requirements file based on platform + cd "text-generation-webui-${VERSION_CLEAN}" + + # Select requirements file based on platform + if [[ "$RUNNER_OS" == "macOS" ]]; then + if [[ "$OS_TYPE" == "macos-15-intel" ]]; then + REQ_FILE="requirements/portable/requirements_apple_intel.txt" + else + REQ_FILE="requirements/portable/requirements_apple_silicon.txt" + fi + else + REQ_FILE="requirements/portable/requirements_cpu_only.txt" + fi + + echo "Using requirements file: $REQ_FILE" + + # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts + if [[ "$RUNNER_OS" == "macOS" ]]; then + sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" + sed -i '' 's/--portable/--portable --ik/g' start_macos.sh + else + sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" + sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true + fi + + # 5. Install packages + echo "Installing Python packages from $REQ_FILE..." + $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE" + + # 6. Clean up + rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py + + # 7. Create archive + cd .. 
+ if [[ "$RUNNER_OS" == "Windows" ]]; then + ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip" + echo "Creating archive: $ARCHIVE_NAME" + powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME" + else + ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz" + echo "Creating archive: $ARCHIVE_NAME" + tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}" + fi + + - name: Upload files to a GitHub release + id: upload-release + uses: svenstaro/upload-release-action@2.7.0 + continue-on-error: true + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: ../textgen-portable-ik-* + tag: ${{ inputs.version }} + file_glob: true + make_latest: false + overwrite: true diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 9b9756a9..5e2decfa 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -11,7 +11,6 @@ import time from pathlib import Path from typing import Any, List -import llama_cpp_binaries import requests from modules import shared @@ -357,7 +356,16 @@ class LlamaServer: """Start the llama.cpp server and wait until it's ready.""" # Determine the server path if self.server_path is None: - self.server_path = llama_cpp_binaries.get_binary_path() + if shared.args.ik: + try: + import ik_llama_cpp_binaries + except ImportError: + raise ImportError("--ik requires the ik_llama_cpp_binaries package. 
Install it with: pip install ") + + self.server_path = ik_llama_cpp_binaries.get_binary_path() + else: + import llama_cpp_binaries + self.server_path = llama_cpp_binaries.get_binary_path() # Build the command cmd = [ @@ -616,10 +624,12 @@ def filter_stderr_with_progress(process_stderr): def _patch_cmd_for_ik(cmd): """ Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents: - --no-webui → --webui none + --no-webui → --webui none --fit off → (removed) --fit on / --fit-ctx → --fit (bare flag) --fit-target → --fit-margin + --cache-reuse → (removed, unsupported) + --swa-full → (removed, unsupported) """ patched = [] i = 0 @@ -635,9 +645,14 @@ def _patch_cmd_for_ik(cmd): patched.append("--fit") # "off" → drop entirely elif arg == "--fit-ctx": + patched.append("--fit") i += 1 # skip the value elif arg == "--fit-target": patched.append("--fit-margin") + elif arg == "--cache-reuse": + i += 1 # skip the value + elif arg == "--swa-full": + pass # bare flag, just drop it else: patched.append(arg) diff --git a/modules/loaders.py b/modules/loaders.py index c90f2ebb..cb1f3d3b 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'mlock', 'numa', + 'ik', 'parallel', 'model_draft', 'draft_max', @@ -345,6 +346,7 @@ def list_model_elements(): 'spec_ngram_size_m', 'spec_ngram_min_hits', 'mmproj', + 'ik', ] diff --git a/modules/shared.py b/modules/shared.py index c50736d7..13843f0c 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -110,7 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.') group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. 
A single value is broadcast across all devices.') group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"') -group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside /lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.') +group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. Requires the ik_llama_cpp_binaries package to be installed.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 5b7621a7..16505afa 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -51,6 +51,9 @@ def create_ui(): with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) + if not shared.args.portable: + shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.') + shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. 
Saves VRAM on MoE models.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 56619627..100c99d1 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -40,8 +40,10 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == 
"Linux" and platform_machine == "x86_64" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 620683cc..66fa4ac7 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index b1f109b2..98dc8be6 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -37,4 +37,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a54476a9..e33264cf 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -37,4 +37,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index be82c904..cd083f6d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -37,5 +37,7 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 188da380..67182225 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 4562b6d0..5f5b2f8d 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 04dcf25e..f5f7d6ee 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 4b8af78a..e51fc296 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 5b0eaf89..683f94c8 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index 90b3234f..942d0877 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index ea72b4ec..ae784e00 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 
tiktoken # Vulkan wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From be6fc0663ac1b7a60b7fde24afb38de2b0aba57b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 28 Mar 2026 08:11:28 -0700 Subject: [PATCH 09/27] Update the custom gradio wheels --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 4 ++-- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_nowheels.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 4 ++-- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cuda131.txt | 4 ++-- requirements/portable/requirements_nowheels.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- 14 files changed, 28 insertions(+), 28 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 100c99d1..6e11dd2f 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -31,8 +31,8 @@ 
tqdm wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 66fa4ac7..c964eff6 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 98dc8be6..b1dd6a4f 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index e33264cf..4d03d280 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index cd083f6d..9d41d069 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 77c254e6..052085cc 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -28,8 +28,8 @@ 
trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 67182225..ff80b6c8 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 5f5b2f8d..318044da 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index f5f7d6ee..1676bffb 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index e51fc296..27fc2da8 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 683f94c8..0bbdd30a 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ 
b/requirements/portable/requirements_cpu_only.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index 942d0877..c3ae3c57 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index e8457909..e38140ce 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index ae784e00..e646c04c 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl # API flask_cloudflared==0.0.15 From 0466b6e2714a05c04eff0c929f15e4679f029e8d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 29 Mar 2026 15:52:36 -0700 Subject: [PATCH 10/27] ik_llama.cpp: Auto-enable Hadamard KV cache rotation with quantized cache --- modules/llama_cpp_server.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 5e2decfa..fa968be1 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -631,6 +631,12 @@ def _patch_cmd_for_ik(cmd): --cache-reuse → (removed, unsupported) --swa-full → (removed, unsupported) """ + # Add Hadamard KV cache rotation when using quantized cache types. + # This significantly improves quantized cache quality (especially q4_0) + # and is a no-op for MLA models like DeepSeek. 
+ if shared.args.cache_type in ("q8_0", "q4_0"): + cmd += ["-khad", "-vhad"] + patched = [] i = 0 while i < len(cmd): From 6382fbef8381bf60ff909b4fd76e7c1f4c063afc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 30 Mar 2026 17:44:19 -0700 Subject: [PATCH 11/27] Several small code simplifications --- download-model.py | 25 +++--- js/dark_theme.js | 12 ++- js/global_scope_js.js | 79 +++++++++--------- js/main.js | 171 +++++++++++++-------------------------- js/save_files.js | 18 ++--- js/show_controls.js | 21 ++--- js/switch_tabs.js | 24 ++---- js/update_big_picture.js | 3 +- modules/extensions.py | 22 +++-- 9 files changed, 140 insertions(+), 235 deletions(-) diff --git a/download-model.py b/download-model.py index 95d25e16..a31bbfc6 100644 --- a/download-model.py +++ b/download-model.py @@ -158,28 +158,21 @@ class ModelDownloader: # Also if GGUF and safetensors are available, download only safetensors if (has_pytorch or has_pt or has_gguf) and has_safetensors: has_gguf = False - for i in range(len(classifications) - 1, -1, -1): - if classifications[i] in ['pytorch', 'pt', 'gguf']: - links.pop(i) - file_sizes.pop(i) + keep = [i for i, c in enumerate(classifications) if c not in ['pytorch', 'pt', 'gguf']] + links = [links[i] for i in keep] + file_sizes = [file_sizes[i] for i in keep] # For GGUF, try to download only the Q4_K_M if no specific file is specified. 
if has_gguf and specific_file is None: - has_q4km = False - for i in range(len(classifications) - 1, -1, -1): - if 'q4_k_m' in links[i].lower(): - has_q4km = True + has_q4km = any('q4_k_m' in link.lower() for link in links) if has_q4km: - for i in range(len(classifications) - 1, -1, -1): - if 'q4_k_m' not in links[i].lower(): - links.pop(i) - file_sizes.pop(i) + keep = [i for i, link in enumerate(links) if 'q4_k_m' in link.lower()] else: - for i in range(len(classifications) - 1, -1, -1): - if links[i].lower().endswith('.gguf'): - links.pop(i) - file_sizes.pop(i) + keep = [i for i, link in enumerate(links) if not link.lower().endswith('.gguf')] + + links = [links[i] for i in keep] + file_sizes = [file_sizes[i] for i in keep] is_llamacpp = has_gguf and specific_file is not None return links, sha256, is_lora, is_llamacpp, file_sizes diff --git a/js/dark_theme.js b/js/dark_theme.js index 7136f5bf..9d7069e2 100644 --- a/js/dark_theme.js +++ b/js/dark_theme.js @@ -1,6 +1,6 @@ function toggleDarkMode() { document.body.classList.toggle("dark"); - var currentCSS = document.getElementById("highlight-css"); + const currentCSS = document.getElementById("highlight-css"); if (currentCSS.getAttribute("href") === "file/css/highlightjs/github-dark.min.css") { currentCSS.setAttribute("href", "file/css/highlightjs/github.min.css"); } else { @@ -9,12 +9,10 @@ function toggleDarkMode() { // Re-highlight all code blocks once stylesheet loads currentCSS.onload = function() { - const messageBodies = document.getElementById("chat").querySelectorAll(".message-body"); - messageBodies.forEach((messageBody) => { - const codeBlocks = messageBody.querySelectorAll("pre code"); - codeBlocks.forEach((codeBlock) => { - hljs.highlightElement(codeBlock); - }); + // Clear data-highlighted so hljs will re-process with the new theme + document.querySelectorAll("#chat .message-body pre code[data-highlighted]").forEach((codeBlock) => { + delete codeBlock.dataset.highlighted; }); + doSyntaxHighlighting(); 
}; } diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 92f65622..20eeef66 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -1,11 +1,35 @@ +// ------------------------------------------------- +// Shared helpers +// ------------------------------------------------- + +function getProfilePictureUrl() { + return "/file/user_data/cache/pfp_character.png?time=" + Date.now(); +} + +const MESSAGE_SELECTOR = ".message, .user-message, .assistant-message"; + +function getMessageElement(element) { + if (!element) return null; + return element.closest(MESSAGE_SELECTOR); +} + +function isUserRole(messageElement) { + return messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") !== null || + messageElement.querySelector(".circle-you") !== null; +} + +// Trigger a synthetic 'input' event so Gradio picks up programmatic value changes +function dispatchGradioInput(element) { + element.dispatchEvent(new Event("input", { bubbles: true })); +} + // ------------------------------------------------- // Event handlers // ------------------------------------------------- function copyToClipboard(element) { - if (!element) return; - - const messageElement = element.closest(".message, .user-message, .assistant-message"); + const messageElement = getMessageElement(element); if (!messageElement) return; const rawText = messageElement.getAttribute("data-raw"); @@ -48,9 +72,7 @@ function fallbackCopyToClipboard(text) { } function branchHere(element) { - if (!element) return; - - const messageElement = element.closest(".message, .user-message, .assistant-message"); + const messageElement = getMessageElement(element); if (!messageElement) return; const index = messageElement.getAttribute("data-index"); @@ -69,11 +91,7 @@ function branchHere(element) { } branchIndexInput.value = index; - - // Trigger any 'change' or 'input' events Gradio might be listening for - const event = new Event("input", { bubbles: true }); - 
branchIndexInput.dispatchEvent(event); - + dispatchGradioInput(branchIndexInput); branchButton.click(); } @@ -82,9 +100,7 @@ function branchHere(element) { // ------------------------------------------------- function editHere(buttonElement) { - if (!buttonElement) return; - - const messageElement = buttonElement.closest(".message, .user-message, .assistant-message"); + const messageElement = getMessageElement(buttonElement); if (!messageElement) return; const messageBody = messageElement.querySelector(".message-body"); @@ -97,12 +113,7 @@ function editHere(buttonElement) { return; } - // Determine role based on message element - handle different chat modes - const isUserMessage = messageElement.classList.contains("user-message") || - messageElement.querySelector(".text-you") !== null || - messageElement.querySelector(".circle-you") !== null; - - startEditing(messageElement, messageBody, isUserMessage); + startEditing(messageElement, messageBody, isUserRole(messageElement)); } function startEditing(messageElement, messageBody, isUserMessage) { @@ -209,30 +220,22 @@ function submitMessageEdit(index, newText, isUserMessage) { editTextInput.value = newText; editRoleInput.value = isUserMessage ? 
"user" : "assistant"; - editIndexInput.dispatchEvent(new Event("input", { bubbles: true })); - editTextInput.dispatchEvent(new Event("input", { bubbles: true })); - editRoleInput.dispatchEvent(new Event("input", { bubbles: true })); + dispatchGradioInput(editIndexInput); + dispatchGradioInput(editTextInput); + dispatchGradioInput(editRoleInput); editButton.click(); return true; } function navigateVersion(element, direction) { - if (!element) return; - - const messageElement = element.closest(".message, .user-message, .assistant-message"); + const messageElement = getMessageElement(element); if (!messageElement) return; const index = messageElement.getAttribute("data-index"); if (!index) return; - // Determine role based on message element classes - let role = "assistant"; // Default role - if (messageElement.classList.contains("user-message") || - messageElement.querySelector(".text-you") || - messageElement.querySelector(".circle-you")) { - role = "user"; - } + const role = isUserRole(messageElement) ? 
"user" : "assistant"; const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input"); const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea"); @@ -248,11 +251,9 @@ function navigateVersion(element, direction) { directionInput.value = direction; roleInput.value = role; - // Trigger 'input' events for Gradio to pick up changes - const event = new Event("input", { bubbles: true }); - indexInput.dispatchEvent(event); - directionInput.dispatchEvent(event); - roleInput.dispatchEvent(event); + dispatchGradioInput(indexInput); + dispatchGradioInput(directionInput); + dispatchGradioInput(roleInput); navigateButton.click(); } @@ -313,7 +314,7 @@ function handleMorphdomUpdate(data) { function applyMorphdomUpdate(data) { // Determine target element and use it as query scope - var target_element, target_html; + let target_element, target_html; if (data.last_message_only) { const childNodes = document.getElementsByClassName("messages")[0].childNodes; target_element = childNodes[childNodes.length - 1]; diff --git a/js/main.js b/js/main.js index f05f93c6..cba4c903 100644 --- a/js/main.js +++ b/js/main.js @@ -4,8 +4,9 @@ // Sync highlight.js theme with the actual Gradio theme var defined_hljs_css = document.body.classList.contains("dark") ? 
"file/css/highlightjs/github-dark.min.css" : "file/css/highlightjs/github.min.css"; -if (document.getElementById("highlight-css").getAttribute("href") !== defined_hljs_css) { - document.getElementById("highlight-css").setAttribute("href", defined_hljs_css); +var hljsCssElement = document.getElementById("highlight-css"); +if (hljsCssElement.getAttribute("href") !== defined_hljs_css) { + hljsCssElement.setAttribute("href", defined_hljs_css); } let main_parent = document.getElementById("chat-tab").parentNode; @@ -49,21 +50,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event) //------------------------------------------------ // --- Helper functions --- // -function isModifiedKeyboardEvent() { - return (event instanceof KeyboardEvent && - event.shiftKey || - event.ctrlKey || - event.altKey || - event.metaKey); +function isModifiedKeyboardEvent(event) { + return event instanceof KeyboardEvent && + (event.shiftKey || event.ctrlKey || event.altKey || event.metaKey); } -function isFocusedOnEditableTextbox() { +function isFocusedOnEditableTextbox(event) { if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") { return !!event.target.value; } + return false; } -let previousTabId = "chat-tab-button"; document.addEventListener("keydown", function(event) { // Stop generation on Esc pressed if (event.key === "Escape") { @@ -117,14 +115,14 @@ document.addEventListener("keydown", function(event) { } // --- Simple version navigation --- // - if (!isFocusedOnEditableTextbox()) { + if (!isFocusedOnEditableTextbox(event)) { // Version navigation on Arrow keys (horizontal) - if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") { + if (!isModifiedKeyboardEvent(event) && event.key === "ArrowLeft") { event.preventDefault(); navigateLastAssistantMessage("left"); } - else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") { + else if (!isModifiedKeyboardEvent(event) && event.key === "ArrowRight") { 
event.preventDefault(); if (!navigateLastAssistantMessage("right")) { // If can't navigate right (last version), regenerate @@ -159,9 +157,8 @@ targetElement.addEventListener("scroll", function() { let diff = targetElement.scrollHeight - targetElement.clientHeight; let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0; - // Add scrolling class to disable hover effects if (window.isScrolled || !isAtBottomNow) { - targetElement.classList.add("scrolling"); + targetElement.classList.add("scrolling"); // Disables hover effects during scroll } if(isAtBottomNow) { @@ -202,12 +199,8 @@ const observer = new MutationObserver(function() { }); // Only watch for attribute changes on targetElement (e.g. _generating class) -const config = { - attributes: true -}; - // Start observing the target element -observer.observe(targetElement, config); +observer.observe(targetElement, { attributes: true }); //------------------------------------------------ // Handle syntax highlighting / LaTeX @@ -228,7 +221,7 @@ window.doSyntaxHighlighting = function() { if (messageBodies.length > 0) { let hasSeenVisible = false; - // Go from last message to first + // Go from last message to first so we can early-exit once past visible area for (let i = messageBodies.length - 1; i >= 0; i--) { const messageBody = messageBodies[i]; @@ -243,8 +236,8 @@ window.doSyntaxHighlighting = function() { codeBlock.classList.add("pretty_scrollbar"); }); - // Only render math in visible elements const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt"); + // Only render math in individually visible containers (the outer check is on the message body) mathContainers.forEach(container => { if (isElementVisibleOnScreen(container)) { renderMathInElement(container, { @@ -271,7 +264,7 @@ const doSyntaxHighlighting = window.doSyntaxHighlighting; // Add some scrollbars //------------------------------------------------ 
const scrollbarElements = document.querySelectorAll(".add_scrollbar textarea, .add_scrollbar .drag-drop-list"); -for(i = 0; i < scrollbarElements.length; i++) { +for(let i = 0; i < scrollbarElements.length; i++) { scrollbarElements[i].classList.remove("scroll-hide"); scrollbarElements[i].classList.add("pretty_scrollbar"); scrollbarElements[i].style.resize = "none"; @@ -298,13 +291,13 @@ if (toolsInfo) { // Remove some backgrounds //------------------------------------------------ const noBackgroundelements = document.querySelectorAll(".no-background"); -for(i = 0; i < noBackgroundelements.length; i++) { +for(let i = 0; i < noBackgroundelements.length; i++) { noBackgroundelements[i].parentNode.style.border = "none"; noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center"; } const slimDropdownElements = document.querySelectorAll(".slim-dropdown"); -for (i = 0; i < slimDropdownElements.length; i++) { +for (let i = 0; i < slimDropdownElements.length; i++) { const parentNode = slimDropdownElements[i].parentNode; parentNode.style.background = "transparent"; parentNode.style.border = "0"; @@ -374,49 +367,43 @@ button.addEventListener("click", function () { } }); -// Add event listener for mouseleave on the button -button.addEventListener("mouseleave", function () { - // Delay to prevent menu hiding when the mouse leaves the button into the menu +// Delay to prevent menu hiding when the mouse leaves the button or menu +function delayedHideMenu() { setTimeout(function () { if (!isMouseOverButtonOrMenu()) { hideMenu(); } }, 100); -}); +} +// Add event listener for mouseleave on the button +button.addEventListener("mouseleave", delayedHideMenu); // Add event listener for mouseleave on the menu -menu.addEventListener("mouseleave", function () { - // Delay to prevent menu hide when the mouse leaves the menu into the button - setTimeout(function () { - if (!isMouseOverButtonOrMenu()) { - hideMenu(); - } - }, 100); -}); 
+menu.addEventListener("mouseleave", delayedHideMenu); // Add event listener for click anywhere in the document document.addEventListener("click", function (event) { - const target = event.target; - // Check if the click is outside the button/menu and the menu is visible if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") { hideMenu(); } - if (event.target.classList.contains("pfp_character")) { + const target = event.target; + + if (target.classList.contains("pfp_character")) { toggleBigPicture(); } // Handle sidebar clicks on mobile if (isMobile()) { - // Check if the click did NOT originate from any of the specified toggle buttons or elements + // Check if the click did NOT originate from any of the specified toggle buttons or elements if ( target.closest("#navigation-toggle") !== navigationToggle && - target.closest("#past-chats-toggle") !== pastChatsToggle && - target.closest("#chat-controls-toggle") !== chatControlsToggle && - target.closest(".header_bar") !== headerBar && - target.closest("#past-chats-row") !== pastChatsRow && - target.closest("#chat-controls") !== chatControlsRow + target.closest("#past-chats-toggle") !== pastChatsToggle && + target.closest("#chat-controls-toggle") !== chatControlsToggle && + target.closest(".header_bar") !== headerBar && + target.closest("#past-chats-row") !== pastChatsRow && + target.closest("#chat-controls") !== chatControlsRow ) { handleIndividualSidebarClose(event); } @@ -433,27 +420,19 @@ document.getElementById("chat-input-row").classList.add("chat-input-positioned") //------------------------------------------------ const chatTextArea = document.getElementById("chat-input").querySelector("textarea"); -function respondToChatInputVisibility(element, callback) { - var options = { - root: document.documentElement, - }; - - var observer = new IntersectionObserver((entries, observer) => { +function focusOnVisible(element) { + var observer = new IntersectionObserver((entries) => { entries.forEach(entry => { - 
callback(entry.intersectionRatio > 0); + if (entry.intersectionRatio > 0) { + element.focus(); + } }); - }, options); + }, { root: document.documentElement }); observer.observe(element); } -function handleChatInputVisibilityChange(isVisible) { - if (isVisible) { - chatTextArea.focus(); - } -} - -respondToChatInputVisibility(chatTextArea, handleChatInputVisibilityChange); +focusOnVisible(chatTextArea); //------------------------------------------------ // Show enlarged character picture when the profile @@ -463,8 +442,7 @@ let bigPictureVisible = false; function addBigPicture() { var imgElement = document.createElement("img"); - var timestamp = new Date().getTime(); - imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp; + imgElement.src = getProfilePictureUrl(); imgElement.classList.add("bigProfilePicture"); imgElement.addEventListener("load", function () { this.style.visibility = "visible"; @@ -478,9 +456,8 @@ function addBigPicture() { } function deleteBigPicture() { - var bigProfilePictures = document.querySelectorAll(".bigProfilePicture"); - bigProfilePictures.forEach(function (element) { - element.parentNode.removeChild(element); + document.querySelectorAll(".bigProfilePicture").forEach(function (element) { + element.remove(); }); } @@ -494,44 +471,11 @@ function toggleBigPicture() { } } -//------------------------------------------------ -// Handle the chat input box growth -//------------------------------------------------ - -// Cache DOM elements -const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode; -const chatInput = document.querySelector("#chat-input textarea"); - -// Variables to store current dimensions -let currentChatInputHeight = chatInput.clientHeight; - //------------------------------------------------ // Focus on the rename text area when it becomes visible //------------------------------------------------ const renameTextArea = 
document.getElementById("rename-row").querySelector("textarea"); - -function respondToRenameVisibility(element, callback) { - var options = { - root: document.documentElement, - }; - - var observer = new IntersectionObserver((entries, observer) => { - entries.forEach(entry => { - callback(entry.intersectionRatio > 0); - }); - }, options); - - observer.observe(element); -} - - -function handleVisibilityChange(isVisible) { - if (isVisible) { - renameTextArea.focus(); - } -} - -respondToRenameVisibility(renameTextArea, handleVisibilityChange); +focusOnVisible(renameTextArea); //------------------------------------------------ // Adjust the chat tab margin if no extension UI @@ -737,21 +681,21 @@ function handleIndividualSidebarClose(event) { // Close navigation bar if click is outside and it is open if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) { - toggleSidebar(headerBar, navigationToggle, true); + toggleSidebar(headerBar, navigationToggle); } // Close past chats row if click is outside and it is open if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(pastChatsRow, pastChatsToggle, true); + toggleSidebar(pastChatsRow, pastChatsToggle); } // Close chat controls row if click is outside and it is open if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(chatControlsRow, chatControlsToggle, true); + toggleSidebar(chatControlsRow, chatControlsToggle); } } -function toggleSidebar(sidebar, toggle, forceClose = false) { +function toggleSidebar(sidebar, toggle) { const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden"); const shouldClose = !isCurrentlyHidden; @@ -776,11 +720,6 @@ function toggleSidebar(sidebar, toggle, forceClose = false) { toggle.classList.toggle("chat-controls-open", !shouldClose); toggle.innerHTML = shouldClose ? 
leftArrowSVG : rightArrowSVG; } - - // Mobile handling - if (isMobile()) { - sidebar.classList.toggle("sidebar-shown", !shouldClose); - } } // Function to check if the device is mobile @@ -840,17 +779,17 @@ pastChatsToggle.addEventListener("click", () => { const isCurrentlyOpen = !pastChatsRow.classList.contains("sidebar-hidden"); toggleSidebar(pastChatsRow, pastChatsToggle); - // On desktop, open/close both sidebars at the same time + // On desktop, sync both sidebars together if (!isMobile()) { if (isCurrentlyOpen) { // If we just closed the left sidebar, also close the right sidebar if (!chatControlsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(chatControlsRow, chatControlsToggle, true); + toggleSidebar(chatControlsRow, chatControlsToggle); } } else { // If we just opened the left sidebar, also open the right sidebar if (chatControlsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(chatControlsRow, chatControlsToggle, false); + toggleSidebar(chatControlsRow, chatControlsToggle); } } } @@ -860,17 +799,17 @@ chatControlsToggle.addEventListener("click", () => { const isCurrentlyOpen = !chatControlsRow.classList.contains("sidebar-hidden"); toggleSidebar(chatControlsRow, chatControlsToggle); - // On desktop, open/close both sidebars at the same time + // On desktop, sync both sidebars together if (!isMobile()) { if (isCurrentlyOpen) { // If we just closed the right sidebar, also close the left sidebar if (!pastChatsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(pastChatsRow, pastChatsToggle, true); + toggleSidebar(pastChatsRow, pastChatsToggle); } } else { // If we just opened the right sidebar, also open the left sidebar if (pastChatsRow.classList.contains("sidebar-hidden")) { - toggleSidebar(pastChatsRow, pastChatsToggle, false); + toggleSidebar(pastChatsRow, pastChatsToggle); } } } @@ -890,7 +829,7 @@ if (isMobile()) { const textarea = document.querySelector("#chat-input textarea"); if (textarea) { - // Simulate adding and 
removing a newline + // Force textarea height recalculation by simulating content change textarea.value += "\n"; textarea.dispatchEvent(new Event("input", { bubbles: true })); textarea.value = textarea.value.slice(0, -1); diff --git a/js/save_files.js b/js/save_files.js index bdb0e334..c3cbf9ff 100644 --- a/js/save_files.js +++ b/js/save_files.js @@ -1,10 +1,9 @@ // Functions for downloading JSON files function getCurrentTimestamp() { const now = new Date(); - const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds + const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert minutes to milliseconds const localTime = new Date(now.getTime() - timezoneOffset); - const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, "").slice(0, 15); - return formattedTimestamp; + return localTime.toISOString().replace(/[-:]/g, "").slice(0, 15); } function saveFile(contents, filename) { @@ -18,23 +17,18 @@ function saveFile(contents, filename) { } function saveHistory(history, character, mode) { - let path = null; + let path; if (["chat", "chat-instruct"].includes(mode) && character && character.trim() !== "") { path = `history_${character}_${getCurrentTimestamp()}.json`; } else { - try { - path = `history_${mode}_${getCurrentTimestamp()}.json`; - } catch (error) { - path = `history_${getCurrentTimestamp()}.json`; - } + path = `history_${mode || "unknown"}_${getCurrentTimestamp()}.json`; } + saveFile(history, path); } function saveSession(session) { - let path = null; - - path = `session_${getCurrentTimestamp()}.json`; + const path = `session_${getCurrentTimestamp()}.json`; saveFile(session, path); } diff --git a/js/show_controls.js b/js/show_controls.js index ff513395..d5642dc4 100644 --- a/js/show_controls.js +++ b/js/show_controls.js @@ -1,13 +1,11 @@ -const chatParent = document.querySelector(".chat-parent"); - function toggle_controls(value) { + const navToggle = document.getElementById("navigation-toggle"); + const pastChatsToggle 
= document.getElementById("past-chats-toggle"); const extensions = document.querySelector("#extensions"); + const galleryExtension = document.getElementById("gallery-extension"); if (value) { // SHOW MODE: Click toggles to show hidden sidebars - const navToggle = document.getElementById("navigation-toggle"); - const pastChatsToggle = document.getElementById("past-chats-toggle"); - if (navToggle && document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) { navToggle.click(); } @@ -19,17 +17,11 @@ function toggle_controls(value) { if (extensions) { extensions.style.display = "inherit"; } - - let gallery_element = document.getElementById("gallery-extension"); - if (gallery_element) { - gallery_element.style.display = "block"; + if (galleryExtension) { + galleryExtension.style.display = "block"; } - } else { // HIDE MODE: Click toggles to hide visible sidebars - const navToggle = document.getElementById("navigation-toggle"); - const pastChatsToggle = document.getElementById("past-chats-toggle"); - if (navToggle && !document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) { navToggle.click(); } @@ -41,5 +33,8 @@ function toggle_controls(value) { if (extensions) { extensions.style.display = "none"; } + if (galleryExtension) { + galleryExtension.style.display = "none"; + } } } diff --git a/js/switch_tabs.js b/js/switch_tabs.js index 36e5736b..a1b44ef3 100644 --- a/js/switch_tabs.js +++ b/js/switch_tabs.js @@ -2,17 +2,9 @@ function scrollToTop() { window.scrollTo({ top: 0 }); } -function findButtonsByText(buttonText) { - const buttons = document.getElementsByTagName("button"); - const matchingButtons = []; - - for (let i = 0; i < buttons.length; i++) { - if (buttons[i].textContent.trim() === buttonText) { - matchingButtons.push(buttons[i]); - } - } - - return matchingButtons; +function findButtonsByText(buttonText, container = document) { + return Array.from(container.getElementsByTagName("button")) + .filter(btn => 
btn.textContent.trim() === buttonText); } function switch_to_chat() { @@ -39,13 +31,9 @@ function switch_to_character() { function switch_to_image_ai_generate() { const container = document.querySelector("#image-ai-tab"); - const buttons = container.getElementsByTagName("button"); - - for (let i = 0; i < buttons.length; i++) { - if (buttons[i].textContent.trim() === "Generate") { - buttons[i].click(); - break; - } + const generateBtn = findButtonsByText("Generate", container)[0]; + if (generateBtn) { + generateBtn.click(); } scrollToTop(); diff --git a/js/update_big_picture.js b/js/update_big_picture.js index ec51d63b..8f638c99 100644 --- a/js/update_big_picture.js +++ b/js/update_big_picture.js @@ -1,7 +1,6 @@ function updateBigPicture() { var existingElement = document.querySelector(".bigProfilePicture"); if (existingElement) { - var timestamp = new Date().getTime(); - existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp; + existingElement.src = getProfilePictureUrl(); } } diff --git a/modules/extensions.py b/modules/extensions.py index 09db9f40..afe847f0 100644 --- a/modules/extensions.py +++ b/modules/extensions.py @@ -191,21 +191,19 @@ def _apply_custom_generate_reply(): def _apply_custom_css(): - all_css = '' - for extension, _ in iterator(): - if hasattr(extension, 'custom_css'): - all_css += getattr(extension, 'custom_css')() - - return all_css + return ''.join( + getattr(extension, 'custom_css')() + for extension, _ in iterator() + if hasattr(extension, 'custom_css') + ) def _apply_custom_js(): - all_js = '' - for extension, _ in iterator(): - if hasattr(extension, 'custom_js'): - all_js += getattr(extension, 'custom_js')() - - return all_js + return ''.join( + getattr(extension, 'custom_js')() + for extension, _ in iterator() + if hasattr(extension, 'custom_js') + ) def create_extensions_block(): From 71c1a52afe54ab599ab5849ae80f1d5a3a72fb5a Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Mon, 30 Mar 2026 20:49:38 -0700 Subject: [PATCH 12/27] API: Implement echo + logprobs for /v1/completions endpoint --- modules/api/completions.py | 299 ++++++++++++++++++++++++++++++------ modules/exllamav3.py | 26 +++- modules/llama_cpp_server.py | 39 ++++- 3 files changed, 309 insertions(+), 55 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 8948bb86..587ad6ea 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -39,6 +39,129 @@ def load_chat_template_file(filepath): return text +def _first_token_display_str(token_id, prompt, tokenizer): + """Return the display string for the first prompt token. + + Returns empty string for BOS or tokens that don't appear at the start + of the prompt text, so they don't shift text_offset for subsequent tokens. + """ + token_id = int(token_id) + bos_id = getattr(tokenizer, 'bos_token_id', None) + if bos_id is not None and token_id == bos_id: + return "" + + import torch + tok = tokenizer.decode(torch.tensor([token_id])) + if not prompt.startswith(tok): + return "" + + return tok + + +def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): + """Compute logprob entries for prompt tokens via a forward pass. + + Returns a list of logprob entries in the standard format. + The first token gets a null entry (no conditioning context). + + Supported for HF-compatible loaders (Transformers, ExLlamav3_HF, etc.) + via a single forward pass, and for llama.cpp via the server's + prompt_logprobs parameter. Returns [] for unsupported loaders. 
+ """ + if input_ids is None: + input_ids = encode(prompt) # (1, seq_len) tensor or array + + token_ids = input_ids[0] + n_tokens = len(token_ids) + + if n_tokens == 0: + return [] + + loader = shared.args.loader + model = shared.model + + if loader == 'llama.cpp': + return model.get_prompt_logprob_entries(token_ids, max(logprobs_count, 1), prompt=prompt) + + first_token_str = _first_token_display_str(token_ids[0], prompt, shared.tokenizer) + + if n_tokens <= 1: + return [{"token": first_token_str, "null_logprob": True}] + + import torch + + if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'): + # Native ExLlamav3: call the underlying Model.forward() directly + input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) + with torch.no_grad(): + logits = model.model.forward( + input_ids=input_ids_tensor, + params={ + "attn_mode": "flash_attn", + "cache": model.cache, + "past_len": 0, + "batch_shape": (1, model.max_tokens), + } + ).float().cpu() + + elif hasattr(model, 'forward'): + # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.) + input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) + if hasattr(model, 'device'): + input_ids_tensor = input_ids_tensor.to(model.device) + with torch.no_grad(): + # Pass labels to ensure logits are returned for ALL positions, + # not just the last token (some HF wrappers like ExLlamav3_HF + # only compute the last-token logits when labels are absent). + outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor) + logits = outputs.logits.float().cpu() + + else: + return [] + + entries = [{"token": first_token_str, "null_logprob": True}] + + # Batch logsumexp and topk as single operations across all positions + # to avoid per-position kernel launch overhead. 
+ prompt_logits = logits[0, :n_tokens - 1] # positions 0..n-2 predict tokens 1..n-1 + k = min(logprobs_count, prompt_logits.shape[-1]) + all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1) + all_lse = torch.logsumexp(prompt_logits, dim=-1) + all_top_log_probs = all_top_values - all_lse.unsqueeze(-1) + + # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls + unique_ids = set(int(tid) for tid in token_ids[1:]) + unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist()) + + decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids} + + for i in range(1, n_tokens): + token_id = int(token_ids[i]) + idx = i - 1 + top_log_probs = all_top_log_probs[idx] + top_ids = all_top_indices[idx].tolist() + actual_token_str = decoded_strs[token_id] + + # Build the top list with the actual prompt token guaranteed at front + if token_id in top_ids: + actual_lp = top_log_probs[top_ids.index(token_id)].item() + alternatives = [ + {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()} + for j in range(k) if top_ids[j] != token_id + ] + else: + actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item() + alternatives = [ + {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()} + for j in range(k - 1) # drop lowest to make room + ] + + entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives} + entries.append(entry) + + return entries + + def _get_raw_logprob_entries(offset=0): """Get raw logprob entries from llama.cpp/ExLlamav3 backend, starting from offset. @@ -65,6 +188,21 @@ def _parse_entry_top(entry): return entry.get('top_logprobs', entry.get('top_probs', [])) +def _extract_sampled_token(entry, top): + """Get the actually sampled token and its logprob from a logprob entry. 
+ + Uses the entry-level token/logprob when available (the actually sampled + token), falling back to top[0] (highest-probability alternative) which + may differ with non-greedy sampling. + """ + if 'token' in entry: + return entry['token'], entry.get('logprob', entry.get('prob', 0)) + + token_str = top[0].get('token', '') + token_logprob = top[0].get('logprob', top[0].get('prob', 0)) + return token_str, token_logprob + + def format_chat_logprobs(entries): """Format logprob entries into OpenAI chat completions logprobs format. @@ -79,9 +217,7 @@ def format_chat_logprobs(entries): if not top: continue - chosen = top[0] - token_str = chosen.get('token', '') - token_logprob = chosen.get('logprob', chosen.get('prob', 0)) + token_str, token_logprob = _extract_sampled_token(entry, top) top_list = [] for item in top: @@ -118,13 +254,21 @@ def format_completion_logprobs(entries): offset = 0 for entry in entries: + # Handle null logprob entries (first prompt token with echo) + if entry.get("null_logprob"): + token_str = entry.get("token", "") + tokens.append(token_str) + token_logprobs.append(None) + top_logprobs.append(None) + text_offset.append(offset) + offset += len(token_str) + continue + top = _parse_entry_top(entry) if not top: continue - chosen = top[0] - token_str = chosen.get('token', '') - token_logprob = chosen.get('logprob', chosen.get('prob', 0)) + token_str, token_logprob = _extract_sampled_token(entry, top) tokens.append(token_str) token_logprobs.append(token_logprob) @@ -407,7 +551,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p }) max_tokens = generate_params['max_new_tokens'] - if max_tokens in [None, 0]: + if max_tokens is not None and max_tokens <= 0: + raise InvalidRequestError(message="max_tokens must be greater than 0.", param="max_tokens") + + if max_tokens is None: generate_params['max_new_tokens'] = 512 generate_params['auto_max_new_tokens'] = True @@ -652,6 +799,15 @@ def completions_common(body: dict, 
is_legacy: bool = False, stream=False, stop_e # common params generate_params = process_parameters(body, is_legacy=is_legacy) max_tokens = generate_params['max_new_tokens'] + if max_tokens is None: + generate_params['max_new_tokens'] = 512 + generate_params['auto_max_new_tokens'] = True + max_tokens = 512 + elif max_tokens < 0: + raise InvalidRequestError(message="max_tokens must be greater than or equal to 0.", param="max_tokens") + elif max_tokens == 0 and body.get('logprobs') is None: + raise InvalidRequestError(message="max_tokens is 0 but no logprobs parameter was specified.", param="max_tokens") + generate_params['stream'] = stream if stop_event is not None: generate_params['stop_event'] = stop_event @@ -700,9 +856,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e prompt = decode(prompt)[0] prefix = prompt if echo else '' - token_count = len(encode(prompt)[0]) + prompt_input_ids = encode(prompt) + token_count = len(prompt_input_ids[0]) total_prompt_token_count += token_count + # Compute prompt logprobs once per prompt (shared across n_completions) + logprobs_val = body.get('logprobs', None) + if echo and logprobs_val is not None and logprobs_val >= 0: + prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids) + else: + prompt_entries = None + original_seed = generate_params.get('seed', -1) for _n in range(n_completions): # Increment seed for each completion to ensure diversity (matches llama.cpp native behavior) @@ -713,29 +877,41 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e logprob_proc.token_alternatives_history.clear() # generate reply ####################################### - debug_msg({'prompt': prompt, 'generate_params': generate_params}) - generator = generate_reply(prompt, generate_params, is_chat=False) - answer = '' - - for a in generator: - answer = a - - completion_token_count = len(encode(answer)[0]) - total_completion_token_count 
+= completion_token_count - stop_reason = "stop" - if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens: - stop_reason = "length" - - if logprob_proc: - all_entries = [] - for alt in logprob_proc.token_alternatives_history: - all_entries.extend(_dict_to_logprob_entries(alt)) - completion_logprobs = format_completion_logprobs(all_entries) - elif shared.args.loader in ('llama.cpp', 'ExLlamav3'): - raw = getattr(shared.model, 'last_completion_probabilities', None) - completion_logprobs = format_completion_logprobs(raw) + if max_tokens == 0: + answer = '' + completion_token_count = 0 + stop_reason = "stop" else: - completion_logprobs = None + debug_msg({'prompt': prompt, 'generate_params': generate_params}) + generator = generate_reply(prompt, generate_params, is_chat=False) + answer = '' + + for a in generator: + answer = a + + completion_token_count = len(encode(answer)[0]) + stop_reason = "stop" + if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens: + stop_reason = "length" + + total_completion_token_count += completion_token_count + + if max_tokens == 0: + all_entries = [] + else: + if logprob_proc: + all_entries = [] + for alt in logprob_proc.token_alternatives_history: + all_entries.extend(_dict_to_logprob_entries(alt)) + elif shared.args.loader in ('llama.cpp', 'ExLlamav3'): + all_entries = getattr(shared.model, 'last_completion_probabilities', None) or [] + else: + all_entries = [] + + if prompt_entries: + all_entries = prompt_entries + all_entries + + completion_logprobs = format_completion_logprobs(all_entries) if all_entries else None respi = { "index": choice_index, @@ -775,7 +951,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e raise InvalidRequestError(message="API Batched generation not yet supported.", param=prompt_str) prefix = prompt if echo else '' - token_count = 
len(encode(prompt)[0]) + prompt_input_ids = encode(prompt) + token_count = len(prompt_input_ids[0]) # Check if usage should be included in streaming chunks per OpenAI spec stream_options = body.get('stream_options') @@ -808,37 +985,57 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e return chunk + logprobs_val = body.get('logprobs', None) + if echo and logprobs_val is not None and logprobs_val >= 0: + prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids) + prompt_logprobs_formatted = format_completion_logprobs(prompt_entries) if prompt_entries else None + else: + prompt_logprobs_formatted = None + + # Clear stale logprobs from any previous request before building the + # first chunk, so text_streaming_chunk doesn't pick up old data. + if hasattr(shared.model, 'last_completion_probabilities'): + shared.model.last_completion_probabilities = [] + cmpl_logprobs_offset[0] = 0 + chunk = text_streaming_chunk(prefix) + if prompt_logprobs_formatted is not None: + chunk[resp_list][0]["logprobs"] = prompt_logprobs_formatted if include_usage: chunk['usage'] = None yield chunk # generate reply ####################################### - debug_msg({'prompt': prompt, 'generate_params': generate_params}) - generator = generate_reply(prompt, generate_params, is_chat=False) - answer = '' - seen_content = '' - completion_token_count = 0 + if max_tokens == 0: + answer = '' + completion_token_count = 0 + stop_reason = "stop" + else: + debug_msg({'prompt': prompt, 'generate_params': generate_params}) + generator = generate_reply(prompt, generate_params, is_chat=False) + answer = '' + seen_content = '' + completion_token_count = 0 - for a in generator: - answer = a + for a in generator: + answer = a - len_seen = len(seen_content) - new_content = answer[len_seen:] + len_seen = len(seen_content) + new_content = answer[len_seen:] - if not new_content or chr(0xfffd) in new_content: # partial unicode character, 
don't send it yet. - continue + if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. + continue - seen_content = answer - chunk = text_streaming_chunk(new_content) - if include_usage: - chunk['usage'] = None - yield chunk + seen_content = answer + chunk = text_streaming_chunk(new_content) + if include_usage: + chunk['usage'] = None + yield chunk - completion_token_count = len(encode(answer)[0]) - stop_reason = "stop" - if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens: - stop_reason = "length" + completion_token_count = len(encode(answer)[0]) + stop_reason = "stop" + if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens: + stop_reason = "length" chunk = text_streaming_chunk(suffix) chunk[resp_list][0]["finish_reason"] = stop_reason diff --git a/modules/exllamav3.py b/modules/exllamav3.py index f873503a..3782a693 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -489,15 +489,35 @@ class Exllamav3Model: return id_to_piece = self.tokenizer.get_id_to_piece_list(True) + sampled_ids = result.get("token_ids") # (batch, seq_len) - actually sampled tokens + sampled_probs = result.get("token_probs") # (batch, seq_len) - their probabilities + + def _piece(tid): + s = id_to_piece[tid] if tid < len(id_to_piece) else f"<{tid}>" + return s.replace('\u2581', ' ') + + def _logprob(prob): + return math.log(prob) if prob > 0 else float("-inf") + # top_k_tokens shape: (batch, seq_len, k), top_k_probs same for seq_idx in range(top_k_tokens.shape[1]): entry = {"top_logprobs": []} for k_idx in range(top_k_tokens.shape[2]): token_id = top_k_tokens[0, seq_idx, k_idx].item() prob = top_k_probs[0, seq_idx, k_idx].item() - token_str = id_to_piece[token_id] if token_id < len(id_to_piece) else f"<{token_id}>" - logprob = math.log(prob) if prob > 0 else float("-inf") - 
entry["top_logprobs"].append({"token": token_str, "logprob": logprob}) + entry["top_logprobs"].append({"token": _piece(token_id), "logprob": _logprob(prob)}) + + # Record the actually sampled token at the entry level so + # format_completion_logprobs uses it instead of top_logprobs[0] + # (they differ with non-greedy sampling). + if sampled_ids is not None: + sid = sampled_ids[0, seq_idx].item() + entry["token"] = _piece(sid) + if sampled_probs is not None: + entry["logprob"] = _logprob(sampled_probs[0, seq_idx].item()) + else: + entry["logprob"] = None + self.last_completion_probabilities.append(entry) def generate(self, prompt, state): diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index fa968be1..34080466 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -310,8 +310,45 @@ class LlamaServer: else: raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") + def get_prompt_logprob_entries(self, token_ids, n_probs=5, prompt=""): + """Get logprob entries for prompt tokens via a single n_predict=0 request. + + Requires llama.cpp server with prompt_logprobs support. + Returns entries in the standard format for format_completion_logprobs(). + """ + token_ids_list = token_ids.tolist() if hasattr(token_ids, 'tolist') else list(token_ids) + + url = f"http://127.0.0.1:{self.port}/completion" + payload = { + "prompt": token_ids_list, + "n_predict": 0, + "n_probs": n_probs, + "prompt_logprobs": True, + "stream": False, + "cache_prompt": False, + } + + response = self.session.post(url, json=payload) + result = response.json() + + prompt_probs = result.get("prompt_probabilities", []) + if not prompt_probs: + return [] + + # Null first token (no conditioning context); use empty string for BOS + # or tokens that don't appear at the start of the prompt text. 
+ first_token_str = self.decode([token_ids_list[0]]) + if self.bos_token and first_token_str == self.bos_token: + first_token_str = "" + elif not prompt.startswith(first_token_str): + first_token_str = "" + + entries = [{"token": first_token_str, "null_logprob": True}] + entries.extend(prompt_probs) + return entries + def _get_vocabulary_size(self): - """Get and store the model's maximum context length.""" + """Get and store the model's vocabulary size.""" url = f"http://127.0.0.1:{self.port}/v1/models" response = self.session.get(url).json() From 328534b762f22c82b09babf6b04e289eab4a7fde Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:51:07 -0700 Subject: [PATCH 13/27] Update llama.cpp --- requirements/full/requirements.txt | 8 ++++---- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 4 ++-- requirements/full/requirements_cpu_only.txt | 8 ++++---- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 2 +- requirements/portable/requirements_apple_silicon.txt | 2 +- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cuda131.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- 12 files changed, 26 insertions(+), 26 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 6e11dd2f..57991c9a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -40,10 +40,10 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 
c964eff6..bb47ea4b 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index b1dd6a4f..5750b109 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 
4d03d280..d8302d3d 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 9d41d069..d3a5c008 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -37,7 +37,7 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; 
platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index ff80b6c8..1180b42d 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 318044da..57aa6262 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; 
platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 1676bffb..894c9199 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 27fc2da8..32b9727f 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_cpu_only.txt 
b/requirements/portable/requirements_cpu_only.txt index 0bbdd30a..73b72832 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index c3ae3c57..ad96bbe2 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git 
a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index e646c04c..a5df3ad4 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # Vulkan wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 4073164be0b305d8ac4a01d4259448370d009a99 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Apr 2026 19:08:37 -0700 Subject: [PATCH 14/27] Fix ExLlamav3 OOM on prompt logprobs and qwen3_5_moe HF compat --- modules/api/completions.py | 13 +++++-------- modules/exllamav3.py | 33 ++++----------------------------- modules/exllamav3_hf.py | 32 ++++++++------------------------ 3 files changed, 17 insertions(+), 61 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 587ad6ea..a15e1f86 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -91,17 +91,14 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): import torch if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'): - # Native ExLlamav3: call the underlying Model.forward() directly + # Native ExLlamav3: call the underlying Model.forward() in chunks + # to avoid 
OOM from giant logits tensors (seq_len * vocab_size * 4 bytes) input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) + input_ids_tensor = input_ids_tensor.view(-1).cpu() with torch.no_grad(): logits = model.model.forward( - input_ids=input_ids_tensor, - params={ - "attn_mode": "flash_attn", - "cache": model.cache, - "past_len": 0, - "batch_shape": (1, model.max_tokens), - } + input_ids=input_ids_tensor.view(1, -1), + params={"attn_mode": "flash_attn_nc"} ).float().cpu() elif hasattr(model, 'forward'): diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 3782a693..7556a908 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -530,39 +530,14 @@ class Exllamav3Model: def get_logits(self, token_ids, **kwargs): """ Process a batch of token_ids and return the logits for the last token. - This will reset and overwrite the model's cache. + Uses flash_attn_nc (no cache) for correct results with recurrent models. """ - # Initialize a single params dictionary that will be updated in-place - params = { - "cache": self.cache, - "reconstruct": False, - "attn_mode": "flash_attn", - "batch_shape": (1, self.max_tokens), - "past_len": 0 - } - params.update(kwargs) - - # Process prefix tokens to fill the cache and generate recurrent state - if token_ids.shape[-1] > 1: - prefix_ids = token_ids[:, :-1] - - # This forward call updates the 'params' dict with the recurrent state - self.model.forward( - input_ids=prefix_ids, - params=params - ) - - # Update past_len for the next call - params["past_len"] = prefix_ids.shape[-1] - - # Process the last token, now using the state-filled 'params' dict - last_token_ids = token_ids[:, -1:] logits = self.model.forward( - input_ids=last_token_ids, - params=params + input_ids=token_ids, + params={"attn_mode": "flash_attn_nc"} ) - return logits.float().cpu() + return logits[:, -1:, :].float().cpu() def encode(self, string, **kwargs): add_bos = kwargs.pop('add_bos', 
True) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index e0ad5002..5e634e22 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -26,6 +26,9 @@ except Exception: class Exllamav3HF(PreTrainedModel, GenerationMixin): def __init__(self, model_dir): hf_config = PretrainedConfig.from_pretrained(model_dir) + # Ensure text_config is a proper object, not a dict (fixes qwen3_5_moe + transformers compat) + if isinstance(getattr(hf_config, 'text_config', None), dict): + hf_config.text_config = PretrainedConfig(**hf_config.text_config) super().__init__(hf_config) exl3_config = Config.from_directory(model_dir) @@ -199,30 +202,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): } ).to(input_ids.device).float() else: - # Labels path: use cache for cross-chunk attention. - tokens_to_process = seq_tensor - all_logits = None - current_len = 0 - - for i in range(0, tokens_to_process.shape[0], max_chunk_size): - chunk = tokens_to_process[i:i + max_chunk_size] - chunk_logits = self.ex_model.forward( - input_ids=chunk.view(1, -1), - params={ - "attn_mode": "flash_attn", - "cache": ex_cache, - "past_len": current_len, - "batch_shape": (1, self.max_tokens), - } - ).float() - current_len += chunk.shape[0] - - if all_logits is None: - all_logits = chunk_logits - else: - all_logits = torch.cat([all_logits, chunk_logits], dim=1) - - logits = all_logits + # Labels path: single pass without cache for correct logits + logits = self.ex_model.forward( + input_ids=seq_tensor.view(1, -1), + params={"attn_mode": "flash_attn_nc"} + ).float().cpu() if is_negative: self.past_seq_negative = seq_tensor From a32ce254f275efe473d6624995957b3b6bd51aa1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Apr 2026 20:28:44 -0700 Subject: [PATCH 15/27] Don't pass torch_dtype to transformers, autodetect from model config --- modules/transformers_loader.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff 
--git a/modules/transformers_loader.py b/modules/transformers_loader.py index 7f521b8c..5964f012 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -109,7 +109,6 @@ def load_model_HF(model_name): params = { 'low_cpu_mem_usage': True, 'attn_implementation': shared.args.attn_implementation, - 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, } if shared.original_args.trust_remote_code: @@ -120,6 +119,17 @@ def load_model_HF(model_name): config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code) + # Determine torch_dtype: respect --bf16 flag, otherwise autodetect + # from model config, but never allow float32. + if shared.args.bf16: + params['torch_dtype'] = torch.bfloat16 + else: + dtype = getattr(config, 'torch_dtype', None) or getattr(getattr(config, 'text_config', None), 'torch_dtype', None) + if dtype in (torch.float16, torch.bfloat16): + params['torch_dtype'] = dtype + else: + params['torch_dtype'] = torch.float16 + if 'chatglm' in model_name.lower(): LoaderClass = AutoModel else: From c10c6e87ae0b0085b36e7e13269461744ce04ff6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 07:17:27 -0700 Subject: [PATCH 16/27] API: Add token ids to logprobs output --- modules/api/completions.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index a15e1f86..453fa07b 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -143,17 +143,17 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): if token_id in top_ids: actual_lp = top_log_probs[top_ids.index(token_id)].item() alternatives = [ - {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()} + {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()} for j in range(k) if 
top_ids[j] != token_id ] else: actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item() alternatives = [ - {"token": decoded_strs[top_ids[j]], "logprob": top_log_probs[j].item()} + {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()} for j in range(k - 1) # drop lowest to make room ] - entry = {"top_logprobs": [{"token": actual_token_str, "logprob": actual_lp}] + alternatives} + entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives} entries.append(entry) return entries @@ -239,7 +239,7 @@ def format_chat_logprobs(entries): def format_completion_logprobs(entries): """Format logprob entries into OpenAI completions logprobs format. - Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "text_offset"} + Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "top_logprobs_ids": [{token_id: prob}], "text_offset"} """ if not entries: return None @@ -247,6 +247,7 @@ def format_completion_logprobs(entries): tokens = [] token_logprobs = [] top_logprobs = [] + top_logprobs_ids = [] text_offset = [] offset = 0 @@ -257,6 +258,7 @@ def format_completion_logprobs(entries): tokens.append(token_str) token_logprobs.append(None) top_logprobs.append(None) + top_logprobs_ids.append(None) text_offset.append(offset) offset += len(token_str) continue @@ -273,21 +275,28 @@ def format_completion_logprobs(entries): offset += len(token_str) top_dict = {} + top_dict_ids = {} for item in top: t = item.get('token', '') lp = item.get('logprob', item.get('prob', 0)) top_dict[t] = lp + if 'token_id' in item: + top_dict_ids[item['token_id']] = lp top_logprobs.append(top_dict) + top_logprobs_ids.append(top_dict_ids if top_dict_ids else None) if not tokens: return None - return { + result = { "tokens": tokens, "token_logprobs": token_logprobs, "top_logprobs": top_logprobs, "text_offset": text_offset } + if any(x is not None for x in top_logprobs_ids): + 
result["top_logprobs_ids"] = top_logprobs_ids + return result def process_parameters(body, is_legacy=False): From ea1f8c71f2e92dc9ae230b943c605e43ff5c633c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:30:59 -0300 Subject: [PATCH 17/27] API: Optimize prompt logprobs and refactor ExLlamav3 forward pass --- modules/api/completions.py | 69 ++++++++++++++++++++++++-------------- modules/exllamav3.py | 14 ++++++++ 2 files changed, 58 insertions(+), 25 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 453fa07b..4eb8fdad 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -90,16 +90,8 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): import torch - if loader == 'ExLlamav3' and hasattr(model, 'model') and hasattr(model, 'cache'): - # Native ExLlamav3: call the underlying Model.forward() in chunks - # to avoid OOM from giant logits tensors (seq_len * vocab_size * 4 bytes) - input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) - input_ids_tensor = input_ids_tensor.view(-1).cpu() - with torch.no_grad(): - logits = model.model.forward( - input_ids=input_ids_tensor.view(1, -1), - params={"attn_mode": "flash_attn_nc"} - ).float().cpu() + if hasattr(model, 'get_prompt_logits'): + logits = model.get_prompt_logits(input_ids) elif hasattr(model, 'forward'): # HF-compatible loaders (Transformers, ExLlamav3_HF, etc.) @@ -111,26 +103,54 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): # not just the last token (some HF wrappers like ExLlamav3_HF # only compute the last-token logits when labels are absent). 
outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor) - logits = outputs.logits.float().cpu() + logits = outputs.logits # keep on GPU, (1, seq_len, vocab) in model dtype + del outputs else: return [] entries = [{"token": first_token_str, "null_logprob": True}] - # Batch logsumexp and topk as single operations across all positions - # to avoid per-position kernel launch overhead. - prompt_logits = logits[0, :n_tokens - 1] # positions 0..n-2 predict tokens 1..n-1 - k = min(logprobs_count, prompt_logits.shape[-1]) - all_top_values, all_top_indices = torch.topk(prompt_logits, k=k, dim=-1) - all_lse = torch.logsumexp(prompt_logits, dim=-1) - all_top_log_probs = all_top_values - all_lse.unsqueeze(-1) - - # Batch-decode all unique token IDs to avoid O(N*k) individual decode calls + logprobs_count = max(logprobs_count, 1) + k = min(logprobs_count, logits.shape[-1]) + chunk_size = 2048 unique_ids = set(int(tid) for tid in token_ids[1:]) - unique_ids.update(int(tid) for tid in all_top_indices.flatten().tolist()) - decoded_strs = {tid: shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids} + # Process logits in chunks on GPU, only move top-K results to CPU + all_top_log_probs_list = [] + all_top_indices_list = [] + all_actual_lps = [] + + for start in range(0, n_tokens - 1, chunk_size): + end = min(start + chunk_size, n_tokens - 1) + chunk_logits = logits[0, start:end].float() # (chunk, vocab) on GPU + chunk_lse = torch.logsumexp(chunk_logits, dim=-1) + chunk_top_values, chunk_top_indices = torch.topk(chunk_logits, k=k, dim=-1) + chunk_top_log_probs = chunk_top_values - chunk_lse.unsqueeze(-1) + + # Compute logprob for actual next tokens in this chunk + chunk_top_sets = [set(chunk_top_indices[j].tolist()) for j in range(end - start)] + for j in range(end - start): + actual_tid = int(token_ids[start + j + 1]) + if actual_tid not in chunk_top_sets[j]: + all_actual_lps.append((chunk_logits[j, actual_tid] - chunk_lse[j]).item()) + else: + 
all_actual_lps.append(None) # will use top_log_probs + + all_top_log_probs_list.append(chunk_top_log_probs.cpu()) + all_top_indices_list.append(chunk_top_indices.cpu()) + unique_ids.update(int(tid) for tid in chunk_top_indices.flatten().tolist()) + del chunk_logits, chunk_lse, chunk_top_values + + del logits + torch.cuda.empty_cache() + + all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0) + all_top_indices = torch.cat(all_top_indices_list, dim=0) + + unique_ids_list = sorted(unique_ids) + decoded_list = shared.tokenizer.batch_decode([[tid] for tid in unique_ids_list]) if hasattr(shared.tokenizer, 'batch_decode') else [shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids_list] + decoded_strs = dict(zip(unique_ids_list, decoded_list)) for i in range(1, n_tokens): token_id = int(token_ids[i]) @@ -139,7 +159,6 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): top_ids = all_top_indices[idx].tolist() actual_token_str = decoded_strs[token_id] - # Build the top list with the actual prompt token guaranteed at front if token_id in top_ids: actual_lp = top_log_probs[top_ids.index(token_id)].item() alternatives = [ @@ -147,10 +166,10 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): for j in range(k) if top_ids[j] != token_id ] else: - actual_lp = (prompt_logits[idx, token_id] - all_lse[idx]).item() + actual_lp = all_actual_lps[idx] alternatives = [ {"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()} - for j in range(k - 1) # drop lowest to make room + for j in range(k - 1) ] entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives} diff --git a/modules/exllamav3.py b/modules/exllamav3.py index 7556a908..e1efbfeb 100644 --- a/modules/exllamav3.py +++ b/modules/exllamav3.py @@ -527,6 +527,20 @@ class Exllamav3Model: return output + def get_prompt_logits(self, input_ids): + """Return logits for 
all positions via a single no-cache forward pass. + + Used by prompt logprobs computation. Returns (1, seq_len, vocab) on CPU in float32. + """ + import torch + input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long) + input_ids_tensor = input_ids_tensor.view(1, -1).cpu() + with torch.no_grad(): + return self.model.forward( + input_ids=input_ids_tensor, + params={"attn_mode": "flash_attn_nc"} + ).cpu().float() + def get_logits(self, token_ids, **kwargs): """ Process a batch of token_ids and return the logits for the last token. From c50e17bdbe1da850189188afaf0682a952efa0d1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:49:31 -0300 Subject: [PATCH 18/27] Add dedicated ik portable requirements files and remove macOS ik builds --- .github/workflows/build-everything-tgw.yml | 7 --- .../build-portable-release-ik-cuda.yml | 9 ++-- .../workflows/build-portable-release-ik.yml | 44 +++---------------- requirements/portable/requirements_ik.txt | 27 ++++++++++++ .../portable/requirements_ik_cpu_only.txt | 27 ++++++++++++ .../portable/requirements_ik_cuda131.txt | 27 ++++++++++++ 6 files changed, 91 insertions(+), 50 deletions(-) create mode 100644 requirements/portable/requirements_ik.txt create mode 100644 requirements/portable/requirements_ik_cpu_only.txt create mode 100644 requirements/portable/requirements_ik_cuda131.txt diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml index 4de591f4..40d9db5d 100644 --- a/.github/workflows/build-everything-tgw.yml +++ b/.github/workflows/build-everything-tgw.yml @@ -96,10 +96,3 @@ jobs: with: version: ${{ inputs.version }} config: 'os:ubuntu-22.04' - - build_release_ik_macos: - name: ik macOS - uses: ./.github/workflows/build-portable-release-ik.yml - with: - version: ${{ inputs.version }} - config: 'os:macos-14' diff --git 
a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml index 40b4b92f..331a7653 100644 --- a/.github/workflows/build-portable-release-ik-cuda.yml +++ b/.github/workflows/build-portable-release-ik-cuda.yml @@ -138,14 +138,13 @@ jobs: # 3. Prepare requirements file based on CUDA version cd "text-generation-webui-${VERSION_CLEAN}" if [[ "$CUDA_VERSION" == "13.1" ]]; then - REQ_FILE="requirements/portable/requirements_cuda131.txt" + REQ_FILE="requirements/portable/requirements_ik_cuda131.txt" else - REQ_FILE="requirements/portable/requirements.txt" + REQ_FILE="requirements/portable/requirements_ik.txt" fi - # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts - sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" - sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true + # 4. Inject --ik into start scripts + sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true # 5. Install packages echo "Installing Python packages from $REQ_FILE..." diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml index afb2e763..bf54eb0e 100644 --- a/.github/workflows/build-portable-release-ik.yml +++ b/.github/workflows/build-portable-release-ik.yml @@ -1,4 +1,4 @@ -name: Build ik CPU and macOS +name: Build ik CPU on: workflow_dispatch: @@ -57,7 +57,7 @@ jobs: id: set-matrix run: | $matrix = @{ - 'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14') + 'os' = @('ubuntu-22.04', 'windows-2022') 'pyver' = @("3.13") } @@ -110,7 +110,6 @@ jobs: # Define common variables VERSION="${{ inputs.version }}" - OS_TYPE="${{ matrix.os }}" # 1. 
Set platform-specific variables if [[ "$RUNNER_OS" == "Windows" ]]; then @@ -119,21 +118,7 @@ jobs: PIP_PATH="portable_env/python.exe -m pip" PACKAGES_PATH="portable_env/Lib/site-packages" rm start_linux.sh start_macos.sh - elif [[ "$RUNNER_OS" == "macOS" ]]; then - if [[ "$OS_TYPE" == "macos-15-intel" ]]; then - PLATFORM="macos-x86_64" - PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz" - REQ_TYPE="apple_intel" - else - PLATFORM="macos-arm64" - PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz" - REQ_TYPE="apple_silicon" - fi - PIP_PATH="portable_env/bin/python -m pip" - PACKAGES_PATH="portable_env/lib/python3.13/site-packages" - rm start_linux.sh start_windows.bat else - # Linux case PLATFORM="linux-cpu" PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" PIP_PATH="portable_env/bin/python -m pip" @@ -148,30 +133,13 @@ jobs: tar -xzf python-build.tar.gz mv python "text-generation-webui-${VERSION_CLEAN}/portable_env" - # 3. Prepare requirements file based on platform + # 3. Prepare requirements file cd "text-generation-webui-${VERSION_CLEAN}" - - # Select requirements file based on platform - if [[ "$RUNNER_OS" == "macOS" ]]; then - if [[ "$OS_TYPE" == "macos-15-intel" ]]; then - REQ_FILE="requirements/portable/requirements_apple_intel.txt" - else - REQ_FILE="requirements/portable/requirements_apple_silicon.txt" - fi - else - REQ_FILE="requirements/portable/requirements_cpu_only.txt" - fi - + REQ_FILE="requirements/portable/requirements_ik_cpu_only.txt" echo "Using requirements file: $REQ_FILE" - # 4. 
Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts - if [[ "$RUNNER_OS" == "macOS" ]]; then - sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" - sed -i '' 's/--portable/--portable --ik/g' start_macos.sh - else - sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE" - sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true - fi + # 4. Inject --ik into start scripts + sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true # 5. Install packages echo "Installing Python packages from $REQ_FILE..." diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt new file mode 100644 index 00000000..2fa037f7 --- /dev/null +++ b/requirements/portable/requirements_ik.txt @@ -0,0 +1,27 @@ +audioop-lts<1.0; python_version >= "3.13" +fastapi==0.112.4 +huggingface-hub==1.5.* +jinja2==3.1.6 +markdown +numpy==2.2.* +pydantic==2.11.0 +pymupdf==1.27.* +python-docx==1.1.2 +pyyaml +requests +rich +trafilatura==2.0.0 +tqdm + +# Gradio +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl + +# API +flask_cloudflared==0.0.15 +sse-starlette==1.6.5 +tiktoken + +# CUDA wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt new file mode 100644 index 00000000..b43b51c4 --- /dev/null +++ 
b/requirements/portable/requirements_ik_cpu_only.txt @@ -0,0 +1,27 @@ +audioop-lts<1.0; python_version >= "3.13" +fastapi==0.112.4 +huggingface-hub==1.5.* +jinja2==3.1.6 +markdown +numpy==2.2.* +pydantic==2.11.0 +pymupdf==1.27.* +python-docx==1.1.2 +pyyaml +requests +rich +trafilatura==2.0.0 +tqdm + +# Gradio +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl + +# API +flask_cloudflared==0.0.15 +sse-starlette==1.6.5 +tiktoken + +# ik_llama.cpp (CPU only) +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt new file mode 100644 index 00000000..12767285 --- /dev/null +++ b/requirements/portable/requirements_ik_cuda131.txt @@ -0,0 +1,27 @@ +audioop-lts<1.0; python_version >= "3.13" +fastapi==0.112.4 +huggingface-hub==1.5.* +jinja2==3.1.6 +markdown +numpy==2.2.* +pydantic==2.11.0 +pymupdf==1.27.* +python-docx==1.1.2 +pyyaml +requests +rich +trafilatura==2.0.0 +tqdm + +# Gradio +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl + +# API +flask_cloudflared==0.0.15 +sse-starlette==1.6.5 +tiktoken + +# CUDA wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 8f8b57a029715d07ab164aa22a779ea7ea4619f1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:54:20 -0700 Subject: [PATCH 19/27] Update exllamav3 --- requirements/full/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 57991c9a..5591c9ca 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -44,7 +44,7 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" 
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" From 6a1f720c7bb9aef73c1c7c4e311460174c5255ec Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:58:20 -0700 Subject: [PATCH 20/27] Update transformers --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 5591c9ca..30ee0316 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -25,7 +25,7 @@ sentencepiece tensorboard torchao==0.15.* trafilatura==2.0.0 -transformers==5.3.* +transformers==5.5.* triton-windows==3.5.1.post24; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index bb47ea4b..9edc1d95 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb diff --git 
a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 5750b109..ff8687c1 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index d8302d3d..208632e8 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index d3a5c008..4a7e5aaa 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 052085cc..6200589e 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -22,7 +22,7 @@ scipy sentencepiece tensorboard torchao==0.15.* -transformers==5.3.* +transformers==5.5.* tqdm trafilatura==2.0.0 wandb From 468cb5cb87bf02f96efcd5acb1d1ac4b08c68273 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:59:28 -0700 Subject: [PATCH 21/27] Update accelerate --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- 
requirements/full/requirements_nowheels.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 30ee0316..e5bec6ec 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" bitsandbytes==0.49.* datasets diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 9edc1d95..c6b5b2d0 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index ff8687c1..ce671f0a 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 208632e8..d12d9f80 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 4a7e5aaa..4066b1af 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 
6200589e..7173345a 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,4 +1,4 @@ -accelerate==1.12.* +accelerate==1.13.* audioop-lts<1.0; python_version >= "3.13" datasets diffusers==0.37.* From 80e81a54cacacbd8aa16ccf312ae0e574e4b416c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 11:11:44 -0700 Subject: [PATCH 22/27] Remove ik macOS wheels from full requirements --- requirements/full/requirements_apple_intel.txt | 1 - requirements/full/requirements_apple_silicon.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index ce671f0a..55a313e9 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -38,4 +38,3 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index d12d9f80..a6d34cbb 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -38,4 +38,3 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" From f6f8f14c8d0993327a2c86dfa3c976a7c1c569fc Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 16:13:39 -0300 Subject: [PATCH 23/27] Security: Fix SSRF in superbooga extensions --- extensions/superbooga/download_urls.py | 3 +++ extensions/superboogav2/download_urls.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/extensions/superbooga/download_urls.py b/extensions/superbooga/download_urls.py index 424a9885..b28fea42 100644 --- a/extensions/superbooga/download_urls.py +++ b/extensions/superbooga/download_urls.py @@ -2,8 +2,11 @@ import concurrent.futures import requests +from modules.web_search import _validate_url + def download_single(url): + _validate_url(url) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' } diff --git a/extensions/superboogav2/download_urls.py b/extensions/superboogav2/download_urls.py index 5b5a2e17..4d8b98b1 100644 --- a/extensions/superboogav2/download_urls.py +++ b/extensions/superboogav2/download_urls.py @@ -5,12 +5,14 @@ import requests from bs4 import BeautifulSoup import extensions.superboogav2.parameters as parameters +from modules.web_search import _validate_url from .data_processor import process_and_add_to_collector from .utils import create_metadata_source def _download_single(url): + _validate_url(url) response = requests.get(url, timeout=5) if response.status_code == 200: return response.content From 091037ec20743ac6c7bccb75b59743045a692c4a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 16:13:45 -0300 Subject: [PATCH 24/27] Fix top_logprobs_ids missing for llama.cpp loader --- modules/api/completions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 4eb8fdad..98bcff47 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -299,8 +299,9 @@ def format_completion_logprobs(entries): t = 
item.get('token', '') lp = item.get('logprob', item.get('prob', 0)) top_dict[t] = lp - if 'token_id' in item: - top_dict_ids[item['token_id']] = lp + tid = item.get('token_id', item.get('id')) + if tid is not None: + top_dict_ids[tid] = lp top_logprobs.append(top_dict) top_logprobs_ids.append(top_dict_ids if top_dict_ids else None) From a61bde509ff44a0f7662067bc94efd7f103f3162 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:30:02 -0700 Subject: [PATCH 25/27] Update llama.cpp --- requirements/full/requirements.txt | 8 ++++---- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 8 ++++---- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 2 +- requirements/portable/requirements_apple_silicon.txt | 2 +- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cuda131.txt | 4 ++-- requirements/portable/requirements_ik.txt | 4 ++-- requirements/portable/requirements_ik_cpu_only.txt | 4 ++-- requirements/portable/requirements_ik_cuda131.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- 15 files changed, 30 insertions(+), 30 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index e5bec6ec..f1a953a5 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -40,10 +40,10 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == 
"Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13" https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index c6b5b2d0..211600e2 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -37,5 +37,5 @@ sse-starlette==1.6.5 tiktoken 
# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 55a313e9..54d904dd 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -37,4 +37,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a6d34cbb..8829eb44 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -37,4 +37,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; 
platform_system == "Darwin" diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 4066b1af..0a8cfac6 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -37,7 +37,7 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 1180b42d..607c642f 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -23,5 +23,5 @@ 
sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 57aa6262..f0af64c8 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 894c9199..c5f351c5 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ 
b/requirements/portable/requirements_apple_intel.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 32b9727f..5287aa25 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -23,4 +23,4 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 73b72832..038318ab 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index ad96bbe2..d87c741e 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt index 2fa037f7..3e2471ae 100644 --- a/requirements/portable/requirements_ik.txt +++ b/requirements/portable/requirements_ik.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == 
"Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt index b43b51c4..8272b9b6 100644 --- a/requirements/portable/requirements_ik_cpu_only.txt +++ b/requirements/portable/requirements_ik_cpu_only.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # ik_llama.cpp (CPU only) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt index 12767285..98ef23d7 100644 --- a/requirements/portable/requirements_ik_cuda131.txt +++ b/requirements/portable/requirements_ik_cuda131.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/ik_llama_cpp_binaries-0.101.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index a5df3ad4..157ad313 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -23,5 +23,5 @@ sse-starlette==1.6.5 tiktoken # Vulkan wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.101.0/llama_cpp_binaries-0.101.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From d84157403a1c8b65f8597302463e46c28a6659d1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:31:44 -0700 Subject: [PATCH 26/27] Update the custom gradio wheels --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 4 ++-- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_nowheels.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- 
requirements/portable/requirements_amd.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 4 ++-- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cuda131.txt | 4 ++-- requirements/portable/requirements_ik.txt | 4 ++-- requirements/portable/requirements_ik_cpu_only.txt | 4 ++-- requirements/portable/requirements_ik_cuda131.txt | 4 ++-- requirements/portable/requirements_nowheels.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- 17 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index f1a953a5..b38ae848 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -31,8 +31,8 @@ tqdm wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 211600e2..7fb3a7d9 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 54d904dd..4a0f764c 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 8829eb44..942d5d71 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 0a8cfac6..6b61dca7 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -28,8 +28,8 
@@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 7173345a..a4d6cc97 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -28,8 +28,8 @@ trafilatura==2.0.0 wandb # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 607c642f..5aff54b2 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index f0af64c8..0771f53e 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index c5f351c5..427d59b2 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 5287aa25..c47a6ca1 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ 
b/requirements/portable/requirements_apple_silicon.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 038318ab..e491e357 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index d87c741e..5870983a 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_ik.txt b/requirements/portable/requirements_ik.txt index 3e2471ae..d11d337d 100644 --- a/requirements/portable/requirements_ik.txt +++ b/requirements/portable/requirements_ik.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_ik_cpu_only.txt b/requirements/portable/requirements_ik_cpu_only.txt index 8272b9b6..c2b69e1c 100644 --- a/requirements/portable/requirements_ik_cpu_only.txt +++ b/requirements/portable/requirements_ik_cpu_only.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_ik_cuda131.txt b/requirements/portable/requirements_ik_cuda131.txt index 98ef23d7..7f280930 100644 --- 
a/requirements/portable/requirements_ik_cuda131.txt +++ b/requirements/portable/requirements_ik_cuda131.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index e38140ce..322056be 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 157ad313..dfd52be5 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -14,8 +14,8 @@ trafilatura==2.0.0 tqdm # Gradio -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl -https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl 
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl +https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl # API flask_cloudflared==0.0.15 From 7aab2fdf9aefb0f14fbf58e132a2a9a5850f8319 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:50:42 -0700 Subject: [PATCH 27/27] API: Improve cache clearing in logprobs --- modules/api/completions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/api/completions.py b/modules/api/completions.py index 98bcff47..f2282731 100644 --- a/modules/api/completions.py +++ b/modules/api/completions.py @@ -89,6 +89,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): return [{"token": first_token_str, "null_logprob": True}] import torch + from modules.torch_utils import clear_torch_cache if hasattr(model, 'get_prompt_logits'): logits = model.get_prompt_logits(input_ids) @@ -143,7 +144,7 @@ def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None): del chunk_logits, chunk_lse, chunk_top_values del logits - torch.cuda.empty_cache() + clear_torch_cache() all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0) all_top_indices = torch.cat(all_top_indices_list, dim=0)