From d746484521c527f64b66264bc9d3ecc22b7461c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:52:51 -0700 Subject: [PATCH 01/27] Handle both int and str types in grammar char processing --- modules/grammar/grammar_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/grammar/grammar_utils.py b/modules/grammar/grammar_utils.py index 7f09ff82..af78f6b9 100644 --- a/modules/grammar/grammar_utils.py +++ b/modules/grammar/grammar_utils.py @@ -463,7 +463,7 @@ class IncrementalGrammarConstraint(GrammarConstraint): super().__init__(grammar_str, start_rule_name, tokenizer) def accept_char(self, char, stacks): - byte = ord(char) + byte = char if isinstance(char, int) else ord(char) new_stacks = [] for stack in stacks: # stack is empty @@ -549,7 +549,7 @@ class IncrementalGrammarConstraint(GrammarConstraint): # For each sub-rule in the grammar, cache whether each byte is accepted. @lru_cache(maxsize=None) def pos_char_acceptance(self, pos, char): - byte = ord(char) + byte = char if isinstance(char, int) else ord(char) num_chars = self.grammar_encoding[pos] pos += 1 for i in range(0, num_chars, 2): From f08bb9a2012eeac213232c2fe087ba330b1801fb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Jul 2025 10:34:59 -0700 Subject: [PATCH 02/27] Handle edge case in chat history loading (closes #7155) --- modules/chat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index 827b6050..1a16a689 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -1175,6 +1175,9 @@ def save_last_chat_state(character, mode, unique_id): def load_history(unique_id, character, mode): p = get_history_file_path(unique_id, character, mode) + if not p.exists(): + return {'internal': [], 'visible': [], 'metadata': {}} + f = json.loads(open(p, 'rb').read()) if 'internal' in f and 'visible' in f: history = f From 74230f559ab5e8536ff22352c4910191667ab12c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 1 Aug 2025 11:03:15 -0700 Subject: [PATCH 03/27] Bump transformers to 4.54 --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 687f1f5a..9810c65a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 8224d987..314e7d4f 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* 
triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 22141a8a..c7a8ba9b 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 04325464..6ce6ae9b 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 9497575f..53128210 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 8a84e403..6ba29008 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 9488f5e7..a1bd0ffc 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index a2af5108..f21a5208 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 948a275a..e76d6668 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 8f7106e4..5fbe49e7 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt 
b/requirements/full/requirements_nowheels.txt index 69a82184..21588344 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb From 02a3420a506631f50bdb3dbd3d6c22ef4344c343 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:09:30 -0700 Subject: [PATCH 04/27] Bump transformers to 4.55 (adds gpt-oss support) --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 9810c65a..3a30a6c7 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 314e7d4f..ae269bc9 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index c7a8ba9b..14871b4b 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 6ce6ae9b..49357939 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 53128210..de33cdb8 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 6ba29008..58496d9d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -22,7 +22,7 @@ 
safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index a1bd0ffc..c5322076 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index f21a5208..804ef934 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index e76d6668..06d93d65 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 5fbe49e7..f9e5fb73 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 21588344..cd85a744 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb From 59890435376110f58656037b1871be553bafbb1f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:22:18 -0700 Subject: [PATCH 05/27] Transformers: Support standalone .jinja chat templates (for GPT-OSS) --- modules/models_settings.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index a06e594e..c4dfb0ed 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -122,13 +122,25 @@ def get_model_metadata(model): # Try to find the Jinja instruct template path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' + template = None + + # 1. Prioritize reading from chat_template.jinja if it exists + jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja' + if jinja_path.exists(): + with open(jinja_path, 'r', encoding='utf-8') as f: + template = f.read() + if path.exists(): metadata = json.loads(open(path, 'r', encoding='utf-8').read()) - if 'chat_template' in metadata: + + # 2. 
Only read from metadata if we haven't already loaded from .jinja + if template is None and 'chat_template' in metadata: template = metadata['chat_template'] if isinstance(template, list): template = template[0]['template'] + # 3. If a template was found from either source, process it + if template: for k in ['eos_token', 'bos_token']: if k in metadata: value = metadata[k] From 3039aeffeb8958724de56912d0d90267b87a7074 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:35:17 -0700 Subject: [PATCH 06/27] Fix parsing the gpt-oss-20b template --- modules/models_settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index c4dfb0ed..8ed7f953 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -90,7 +90,7 @@ def get_model_metadata(model): template = template.replace('eos_token', "'{}'".format(eos_token)) template = template.replace('bos_token', "'{}'".format(bos_token)) - template = re.sub(r'raise_exception\([^)]*\)', "''", template) + template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template @@ -149,7 +149,7 @@ def get_model_metadata(model): template = template.replace(k, "'{}'".format(value)) - template = re.sub(r'raise_exception\([^)]*\)', "''", template) + template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template From 3b28dc182186308648af5cddb44c811f1608c70d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:35:53 -0700 Subject: [PATCH 07/27] Don't pass torch_dtype to transformers loader, let it be autodetected --- modules/transformers_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py index 2f7367a4..e4072125 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -136,7 +136,6 @@ def load_model_HF(model_name): path_to_model = Path(f'{shared.args.model_dir}/{model_name}') params = { 'low_cpu_mem_usage': True, - 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, 'attn_implementation': shared.args.attn_implementation, } From 9f28f53cfc7d14cf8e3c9ebd00834126f73675b4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:56:00 -0700 Subject: [PATCH 08/27] Better parsing of the gpt-oss template --- modules/chat.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 1a16a689..c10d91a7 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -108,7 +108,14 @@ def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=Tru suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] suffix = prompt.split("<<|user-message-2|>>")[1] - prefix = suffix_plus_prefix[len(suffix):] + + # Remove the message suffix. 
The first case handles the GPT-OSS model + # in a way that is likely to not interfere with previous models. + if '<|start|>user' in suffix_plus_prefix or '<|start|>assistant' in suffix_plus_prefix: + start_index = suffix_plus_prefix.rindex('<|start|>') + prefix = suffix_plus_prefix[start_index:] + else: + prefix = suffix_plus_prefix[len(suffix):] if strip_trailing_spaces: prefix = prefix.rstrip(' ') From 178c3e75cca827657a018a64ae3d7945d9e25231 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:38:06 -0700 Subject: [PATCH 09/27] Handle templates with channels separately --- modules/chat.py | 184 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 157 insertions(+), 27 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index c10d91a7..f929f653 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -86,6 +86,134 @@ yaml.add_representer(str, str_presenter) yaml.representer.SafeRepresenter.add_representer(str, str_presenter) +# Template Handler Classes +class TemplateHandler: + """Base class for handling different template types""" + + def __init__(self, template_str): + self.template_str = template_str + + def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): + """Get prefix/suffix for generation""" + return "", "" + + def get_stopping_strings(self, renderer): + """Get stopping strings for this template type""" + return [] + + def modify_for_continue(self, prompt, renderer, impersonate=False): + """Modify prompt for continue mode""" + return prompt + + def supports_impersonate(self): + """Whether impersonate mode is supported""" + return False + + +class LinearTemplateHandler(TemplateHandler): + """Handles traditional linear templates""" + + def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): + # This is the original, complex logic for deriving prefix/suffix for old templates. + if impersonate: + messages = [ + {"role": "user", "content": "<<|user-message-1|>>"}, + {"role": "user", "content": "<<|user-message-2|>>"}, + ] + else: + messages = [ + {"role": "assistant", "content": "<<|user-message-1|>>"}, + {"role": "assistant", "content": "<<|user-message-2|>>"}, + ] + + prompt = renderer(messages=messages) + suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] + suffix = prompt.split("<<|user-message-2|>>")[1] + + if '<|start|>user' in suffix_plus_prefix or '<|start|>assistant' in suffix_plus_prefix: + start_index = suffix_plus_prefix.rindex('<|start|>') + prefix = suffix_plus_prefix[start_index:] + else: + prefix = suffix_plus_prefix[len(suffix):] + + if strip_trailing_spaces: + prefix = prefix.rstrip(' ') + + return prefix, suffix + + def get_stopping_strings(self, renderer): + # This is the original, correct logic for dynamically creating stopping strings for linear templates. 
+ prefix_bot, suffix_bot = self.get_generation_prefix_suffix(renderer, impersonate=False) + prefix_user, suffix_user = self.get_generation_prefix_suffix(renderer, impersonate=True) + + stopping_strings = [ + suffix_user + prefix_bot, + suffix_user + prefix_user, + suffix_bot + prefix_bot, + suffix_bot + prefix_user, + ] + + # Attempt to find a single EOT token to use as a stop string + for item in stopping_strings: + item = item.strip() + if item.startswith("<") and ">" in item: + stopping_strings.append(item.split(">")[0] + ">") + break + elif item.startswith("[") and "]" in item: + stopping_strings.append(item.split("]")[0] + "]") + break + + return stopping_strings + + def modify_for_continue(self, prompt, renderer, impersonate=False): + suffix = self.get_generation_prefix_suffix(renderer, impersonate)[1] + if len(suffix) > 0: + return prompt[:-len(suffix)] + return prompt + + def supports_impersonate(self): + return True + + +class ChannelTemplateHandler(TemplateHandler): + """Handles channel-based templates""" + + def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): + """ + Gets the string to add to the prompt to start a new generation. + """ + dummy_message = [{'role': 'user', 'content': '...'}] + prompt_without_gen = renderer(messages=dummy_message, add_generation_prompt=False) + prompt_with_gen = renderer(messages=dummy_message, add_generation_prompt=True) + generation_prompt = prompt_with_gen[len(prompt_without_gen):] + + if strip_trailing_spaces: + generation_prompt = generation_prompt.rstrip(' ') + + return generation_prompt, "" + + def get_stopping_strings(self, renderer): + return [ + '<|return|>', + '<|start|>user', + '<|start|>developer', + '<|call|>' + ] + + def modify_for_continue(self, prompt, renderer, impersonate=False): + return prompt + + def supports_impersonate(self): + return False + + +def create_template_handler(template_str): + """Factory function to create appropriate handler""" + if '<|channel|>' in template_str: + return ChannelTemplateHandler(template_str) + return LinearTemplateHandler(template_str) + + def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True): ''' Given a Jinja template, reverse-engineers the prefix and the suffix for @@ -270,6 +398,15 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "user", "content": user_input}) + # Create template handler based on current template + template_str = state['instruction_template_str'] if state['mode'] == 'instruct' else chat_template_str + handler = create_template_handler(template_str) + + # Check impersonate support early + if impersonate and not handler.supports_impersonate(): + logger.warning("Impersonate not supported for channel-based templates") + return "" + def make_prompt(messages): if state['mode'] == 'chat-instruct' and _continue: prompt = renderer(messages=messages[:-1]) @@ -287,10 +424,10 @@ def generate_chat_prompt(user_input, state, **kwargs): command = replace_character_names(command, state['name1'], state['name2']) if _continue: - prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] + prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] prefix += messages[-1]["content"] else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate)[0] if not impersonate: prefix = 
apply_extensions('bot_prefix', prefix, state) @@ -298,16 +435,14 @@ def generate_chat_prompt(user_input, state, **kwargs): outer_messages.append({"role": "assistant", "content": prefix}) prompt = instruct_renderer(messages=outer_messages) - suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] + suffix = handler.get_generation_prefix_suffix(instruct_renderer, impersonate=False)[1] if len(suffix) > 0: prompt = prompt[:-len(suffix)] else: if _continue: - suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] - if len(suffix) > 0: - prompt = prompt[:-len(suffix)] + prompt = handler.modify_for_continue(prompt, renderer, impersonate) else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate)[0] if state['mode'] == 'chat' and not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) @@ -433,31 +568,16 @@ def get_stopping_strings(state): if state['mode'] in ['instruct', 'chat-instruct']: template = jinja_env.from_string(state['instruction_template_str']) renderer = partial(template.render, add_generation_prompt=False) - renderers.append(renderer) + renderers.append((renderer, state['instruction_template_str'])) if state['mode'] in ['chat', 'chat-instruct']: template = jinja_env.from_string(state['chat_template_str']) renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2']) - renderers.append(renderer) + renderers.append((renderer, state['chat_template_str'])) - for renderer in renderers: - prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False) - prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True) - - stopping_strings += [ - suffix_user + prefix_bot, - suffix_user + prefix_user, - suffix_bot + prefix_bot, - suffix_bot + prefix_user, - ] - - # Try to find the EOT token - for item in stopping_strings.copy(): - item = item.strip() - if item.startswith("<") and ">" in item: - stopping_strings.append(item.split(">")[0] + ">") - elif item.startswith("[") and "]" in item: - stopping_strings.append(item.split("]")[0] + "]") + for renderer, template_str in renderers: + handler = create_template_handler(template_str) + stopping_strings += handler.get_stopping_strings(renderer) if 'stopping_strings' in state and isinstance(state['stopping_strings'], list): stopping_strings += state.pop('stopping_strings') @@ -809,6 +929,16 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess def impersonate_wrapper(textbox, state): + # Check template support first + template_str = state['chat_template_str'] + handler = create_template_handler(template_str) + + if not handler.supports_impersonate(): + logger.warning("Impersonate not supported for channel-based templates") + static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + yield textbox, static_output + return + text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) From ecd16d6bf9f680ba5b25eb837bf61569dde81886 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:57:49 -0700 Subject: [PATCH 10/27] Automatically set skip_special_tokens to False for channel-based templates --- modules/chat.py | 10 ++++++++++ 1 file changed, 10 insertions(+) 
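Editorial aside (not part of the patch series): patches 09 and 10 hinge on one dispatch rule, namely that a chat template containing the literal '<|channel|>' marker is treated as a GPT-OSS-style channel template whose turn boundaries are literal control tokens ('<|start|>', '<|message|>', '<|return|>', '<|call|>') rather than a reverse-engineered prefix/suffix pair; that is also why the hunk below forces skip_special_tokens off, so those tokens survive decoding. A minimal usage sketch, assuming the handler classes from PATCH 09 are importable from modules.chat as the diffs suggest (the template string below is illustrative only, not taken from the repository):

    from modules.chat import (
        ChannelTemplateHandler,
        LinearTemplateHandler,
        create_template_handler,
    )

    # A channel-style template is recognized purely by the '<|channel|>' marker.
    channel_template = "<|start|>assistant<|channel|>final<|message|>{{ message['content'] }}<|end|>"
    handler = create_template_handler(channel_template)
    assert isinstance(handler, ChannelTemplateHandler)

    # Channel templates use fixed control tokens as stop strings, so the
    # renderer argument is not consulted.
    print(handler.get_stopping_strings(renderer=None))
    # ['<|return|>', '<|start|>user', '<|start|>developer', '<|call|>']

    # Templates without the marker keep the legacy linear prefix/suffix logic.
    assert isinstance(create_template_handler("{{ messages }}"), LinearTemplateHandler)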
diff --git a/modules/chat.py b/modules/chat.py index f929f653..46d24a6f 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -770,6 +770,16 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess output = apply_extensions('history', output) state = apply_extensions('state', state) + # Automatically set skip_special_tokens to False for channel-based templates + if state['mode'] in ['instruct', 'chat-instruct']: + template_str = state['instruction_template_str'] + else: # chat mode + template_str = state['chat_template_str'] + + handler = create_template_handler(template_str) + if isinstance(handler, ChannelTemplateHandler): + state['skip_special_tokens'] = False + # Let the jinja2 template handle the BOS token if state['mode'] in ['instruct', 'chat-instruct']: state['add_bos_token'] = False From 5c5a4dfc140d3e6558c97fc84c430faa2444ef28 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 13:03:18 -0700 Subject: [PATCH 11/27] Fix impersonate --- modules/chat.py | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 46d24a6f..043908c9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -105,10 +105,6 @@ class TemplateHandler: """Modify prompt for continue mode""" return prompt - def supports_impersonate(self): - """Whether impersonate mode is supported""" - return False - class LinearTemplateHandler(TemplateHandler): """Handles traditional linear templates""" @@ -171,41 +167,41 @@ class LinearTemplateHandler(TemplateHandler): return prompt[:-len(suffix)] return prompt - def supports_impersonate(self): - return True - class ChannelTemplateHandler(TemplateHandler): """Handles channel-based templates""" def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): """ - Gets the string to add to the prompt to start a new generation. + Gets the string to add to the prompt to start a new turn. """ - dummy_message = [{'role': 'user', 'content': '...'}] - prompt_without_gen = renderer(messages=dummy_message, add_generation_prompt=False) - prompt_with_gen = renderer(messages=dummy_message, add_generation_prompt=True) - generation_prompt = prompt_with_gen[len(prompt_without_gen):] + if impersonate: + # For impersonate mode, we need the prefix for a user's turn. + prefix = "<|start|>user<|message|>" + else: + # For a normal reply, we need the prefix for the assistant's turn. + prefix = "<|start|>assistant" if strip_trailing_spaces: - generation_prompt = generation_prompt.rstrip(' ') + prefix = prefix.rstrip(' ') - return generation_prompt, "" + # The suffix is not needed for this template type's generation logic. + return prefix, "" def get_stopping_strings(self, renderer): + # Use specific tokens that unambiguously signal the end of a turn + # or the start of a different character's turn. return [ '<|return|>', '<|start|>user', '<|start|>developer', - '<|call|>' + '<|call|>', ] def modify_for_continue(self, prompt, renderer, impersonate=False): + # Channels don't need suffix stripping for the continue logic to work. 
return prompt - def supports_impersonate(self): - return False - def create_template_handler(template_str): """Factory function to create appropriate handler""" @@ -402,11 +398,6 @@ def generate_chat_prompt(user_input, state, **kwargs): template_str = state['instruction_template_str'] if state['mode'] == 'instruct' else chat_template_str handler = create_template_handler(template_str) - # Check impersonate support early - if impersonate and not handler.supports_impersonate(): - logger.warning("Impersonate not supported for channel-based templates") - return "" - def make_prompt(messages): if state['mode'] == 'chat-instruct' and _continue: prompt = renderer(messages=messages[:-1]) @@ -943,12 +934,6 @@ def impersonate_wrapper(textbox, state): template_str = state['chat_template_str'] handler = create_template_handler(template_str) - if not handler.supports_impersonate(): - logger.warning("Impersonate not supported for channel-based templates") - static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - yield textbox, static_output - return - text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) From 42e3a7a5ae7011d12987f23056220c506af69af6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 14:56:12 -0700 Subject: [PATCH 12/27] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 3a30a6c7..dd1e8d35 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index ae269bc9..b65f0b09 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 14871b4b..6e698654 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != 
"Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 49357939..84abd394 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index de33cdb8..2deefbc4 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl 
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 58496d9d..8c1baf04 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index c5322076..67a44432 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 804ef934..9fe3c54b 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 06d93d65..50602d8d 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index f9e5fb73..abdcfc16 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 53479a80..30d7d9e4 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index d7336d2f..a7c7808a 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 1edaa515..b1f66f56 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" 
and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 04c9b283..76530338 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 3c3563d3..26235b83 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cf0d7b11..cfa76310 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 9bd8a37c..2f8c401d 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index b8519553..e0650575 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 6bb8212731db0dddb00d10494e56223718401d4c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:06:22 -0700 Subject: [PATCH 13/27] Fix thinking block rendering for GPT-OSS --- modules/html_generator.py | 63 ++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index 6844c244..8777acf7 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -116,29 +116,56 @@ def extract_thinking_block(string): THINK_START_TAG = "<think>" THINK_END_TAG = "</think>" - # Look for think tag + # Look for think tag first start_pos = string.find(THINK_START_TAG) end_pos = string.find(THINK_END_TAG) - # Return if neither tag is in string - if start_pos == -1 and end_pos == -1: - return None, string + # If think tags found, use existing logic + if 
start_pos != -1 or end_pos != -1: + # handle missing start or end tags + if start_pos == -1: + thought_start = 0 + else: + thought_start = start_pos + len(THINK_START_TAG) + if end_pos == -1: + thought_end = len(string) + content_start = len(string) + else: + thought_end = end_pos + content_start = end_pos + len(THINK_END_TAG) + thinking_content = string[thought_start:thought_end] + remaining_content = string[content_start:] + return thinking_content, remaining_content - # handle missing start or end tags - if start_pos == -1: - thought_start = 0 - else: - thought_start = start_pos + len(THINK_START_TAG) - if end_pos == -1: - thought_end = len(string) - content_start = len(string) - else: - thought_end = end_pos - content_start = end_pos + len(THINK_END_TAG) + # If think tags not found, try alternative format + ALT_START = "<|channel|>analysis<|message|>" + ALT_END = "<|end|>" + ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>" - thinking_content = string[thought_start:thought_end] - remaining_content = string[content_start:] - return thinking_content, remaining_content + alt_start_pos = string.find(ALT_START) + alt_end_pos = string.find(ALT_END) + alt_content_pos = string.find(ALT_CONTENT_START) + + # Check if start tag or end tag is found + if alt_start_pos != -1 or alt_end_pos != -1: + if alt_start_pos == -1: + thought_start = 0 + else: + thought_start = alt_start_pos + len(ALT_START) + + if alt_end_pos == -1: + thought_end = len(string) + content_start = len(string) + else: + thought_end = alt_end_pos + content_start = alt_content_pos + len(ALT_CONTENT_START) if alt_content_pos != -1 else len(string) + + thinking_content = string[thought_start:thought_end] + remaining_content = string[content_start:] + return thinking_content, remaining_content + + # Return if neither format is found + return None, string @functools.lru_cache(maxsize=None) From 498778b8ac85990158713c1925ca657d1fa135c8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:19:11 -0700 Subject: [PATCH 14/27] Add a new 'Reasoning effort' UI element --- extensions/openai/typing.py | 1 + modules/chat.py | 3 ++- modules/loaders.py | 5 +++++ modules/shared.py | 1 + modules/ui.py | 2 ++ modules/ui_chat.py | 3 ++- 6 files changed, 13 insertions(+), 2 deletions(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 6643ed16..6bd3749f 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -43,6 +43,7 @@ class GenerationOptions(BaseModel): ban_eos_token: bool = False add_bos_token: bool = True enable_thinking: bool = True + reasoning_effort: str = "medium" skip_special_tokens: bool = True static_cache: bool = False truncation_length: int = 0 diff --git a/modules/chat.py b/modules/chat.py index 043908c9..dd923d67 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -306,7 +306,8 @@ def generate_chat_prompt(user_input, state, **kwargs): builtin_tools=None, tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, - add_generation_prompt=False + add_generation_prompt=False, + reasoning_effort=state.get('reasoning_effort', 'medium') ) chat_renderer = partial( diff --git a/modules/loaders.py b/modules/loaders.py index f515aeca..7546bc5b 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -137,6 +137,7 @@ def transformers_samplers(): 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'static_cache', 'seed', @@ -189,6 +190,7 @@ 
loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'seed', 'sampler_priority', @@ -236,6 +238,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'seed', 'sampler_priority', @@ -275,6 +278,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'seed', 'custom_token_bans', @@ -308,6 +312,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'seed', 'sampler_priority', 'dry_sequence_breakers', diff --git a/modules/shared.py b/modules/shared.py index 5e3e11c0..ab5198d1 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -211,6 +211,7 @@ settings = { 'ban_eos_token': False, 'add_bos_token': True, 'enable_thinking': True, + 'reasoning_effort': 'medium', 'skip_special_tokens': True, 'stream': True, 'static_cache': False, diff --git a/modules/ui.py b/modules/ui.py index 98acc038..e7805046 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -215,6 +215,7 @@ def list_interface_input_elements(): 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'stream', 'static_cache', @@ -482,6 +483,7 @@ def setup_auto_save(): 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'stream', 'static_cache', diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 4dade176..1d85a398 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -78,7 +78,8 @@ def create_ui(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle mode.') + shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.') shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search') with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) From 0e42575c57b374e2b652b15cc2e03daec3170bc6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:36:20 -0700 Subject: [PATCH 15/27] Fix thinking block parsing for GPT-OSS under llama.cpp --- modules/html_generator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index 8777acf7..79237f7f 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -146,19 +146,23 @@ def extract_thinking_block(string): alt_end_pos = string.find(ALT_END) alt_content_pos = string.find(ALT_CONTENT_START) - # Check if start tag or end tag is found if alt_start_pos != -1 or alt_end_pos != -1: if alt_start_pos == -1: thought_start = 0 else: thought_start = alt_start_pos + len(ALT_START) + # If no 
explicit end tag but content start exists, use content start as end
         if alt_end_pos == -1:
-            thought_end = len(string)
-            content_start = len(string)
+            if alt_content_pos != -1:
+                thought_end = alt_content_pos
+                content_start = alt_content_pos + len(ALT_CONTENT_START)
+            else:
+                thought_end = len(string)
+                content_start = len(string)
         else:
             thought_end = alt_end_pos
-            content_start = alt_content_pos + len(ALT_CONTENT_START) if alt_content_pos != -1 else len(string)
+            content_start = alt_content_pos + len(ALT_CONTENT_START) if alt_content_pos != -1 else alt_end_pos + len(ALT_END)

         thinking_content = string[thought_start:thought_end]
         remaining_content = string[content_start:]

From 7d98ca619558c9b77fa26130e3656b0bf8843341 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 5 Aug 2025 15:43:44 -0700
Subject: [PATCH 16/27] Make web search functional with thinking models

---
 modules/chat.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index dd923d67..dbc0e6f6 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -307,7 +307,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
         tools=state['tools'] if 'tools' in state else None,
         tools_in_user_message=False,
         add_generation_prompt=False,
-        reasoning_effort=state.get('reasoning_effort', 'medium')
+        reasoning_effort=state['reasoning_effort'])
     )

     chat_renderer = partial(
@@ -730,9 +730,9 @@ def generate_search_query(user_message, state):

     # Use a minimal state for search query generation but keep the full history
     search_state = state.copy()
-    search_state['max_new_tokens'] = 64
-    search_state['auto_max_new_tokens'] = False
+    search_state['auto_max_new_tokens'] = True
     search_state['enable_thinking'] = False
+    search_state['reasoning_effort'] = 'low'
     search_state['start_with'] = ""

     # Generate the full prompt using existing history + augmented message
@@ -742,6 +742,12 @@ def generate_search_query(user_message, state):
     for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True):
         query = reply

+    # Check for thinking block delimiters and extract content after them
+    if "</think>" in query:
+        query = query.rsplit("</think>", 1)[1]
+    elif "<|start|>assistant<|channel|>final<|message|>" in query:
+        query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
+
     # Strip and remove surrounding quotes if present
     query = query.strip()
     if len(query) >= 2 and query.startswith('"') and query.endswith('"'):

From 701048cf336946177ea216d3456e5f7cdd1cab85 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 5 Aug 2025 15:51:24 -0700
Subject: [PATCH 17/27] Try to avoid breaking jinja2 parsing for older models

---
 modules/models_settings.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index 8ed7f953..3c068df0 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -91,6 +91,7 @@ def get_model_metadata(model):
                 template = template.replace('bos_token', "'{}'".format(bos_token))

                 template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
+                template = re.sub(r'raise_exception\([^)]*\)', "''", template)
                 template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
                 model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
                 model_settings['instruction_template_str'] = template
@@ -150,6 +151,7 @@
template = template.replace(k, "'{}'".format(value)) template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) + template = re.sub(r'raise_exception\([^)]*\)', "''", template) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template From e5b8d4d072f74281071a9ad911bede662d61767e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:52:56 -0700 Subject: [PATCH 18/27] Fix a typo --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index dbc0e6f6..1f4e2af0 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -307,7 +307,7 @@ def generate_chat_prompt(user_input, state, **kwargs): tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, add_generation_prompt=False, - reasoning_effort=state['reasoning_effort']) + reasoning_effort=state['reasoning_effort'] ) chat_renderer = partial( From 80f6abb07e44cb70d65a0d43fe9676a02880eb2c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:01:19 -0700 Subject: [PATCH 19/27] Begin fixing 'Continue' with GPT-OSS --- modules/chat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 1f4e2af0..b23340aa 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -165,6 +165,7 @@ class LinearTemplateHandler(TemplateHandler): suffix = self.get_generation_prefix_suffix(renderer, impersonate)[1] if len(suffix) > 0: return prompt[:-len(suffix)] + return prompt @@ -199,7 +200,10 @@ class ChannelTemplateHandler(TemplateHandler): ] def modify_for_continue(self, prompt, renderer, impersonate=False): - # Channels don't need suffix stripping for the continue logic to work. 
+ suffix = '<|return|>' + if prompt.endswith(suffix): + return prompt[:-len(suffix)] + return prompt From 20adc3c96737e35b96f6b1d557a63b1d2c75a825 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:58:45 -0700 Subject: [PATCH 20/27] Start over new template handling (to avoid overcomplicating) --- modules/chat.py | 192 +++++++----------------------------------------- 1 file changed, 28 insertions(+), 164 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index b23340aa..82760cc8 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -86,134 +86,6 @@ yaml.add_representer(str, str_presenter) yaml.representer.SafeRepresenter.add_representer(str, str_presenter) -# Template Handler Classes -class TemplateHandler: - """Base class for handling different template types""" - - def __init__(self, template_str): - self.template_str = template_str - - def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): - """Get prefix/suffix for generation""" - return "", "" - - def get_stopping_strings(self, renderer): - """Get stopping strings for this template type""" - return [] - - def modify_for_continue(self, prompt, renderer, impersonate=False): - """Modify prompt for continue mode""" - return prompt - - -class LinearTemplateHandler(TemplateHandler): - """Handles traditional linear templates""" - - def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): - # This is the original, complex logic for deriving prefix/suffix for old templates. - if impersonate: - messages = [ - {"role": "user", "content": "<<|user-message-1|>>"}, - {"role": "user", "content": "<<|user-message-2|>>"}, - ] - else: - messages = [ - {"role": "assistant", "content": "<<|user-message-1|>>"}, - {"role": "assistant", "content": "<<|user-message-2|>>"}, - ] - - prompt = renderer(messages=messages) - suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] - suffix = prompt.split("<<|user-message-2|>>")[1] - - if '<|start|>user' in suffix_plus_prefix or '<|start|>assistant' in suffix_plus_prefix: - start_index = suffix_plus_prefix.rindex('<|start|>') - prefix = suffix_plus_prefix[start_index:] - else: - prefix = suffix_plus_prefix[len(suffix):] - - if strip_trailing_spaces: - prefix = prefix.rstrip(' ') - - return prefix, suffix - - def get_stopping_strings(self, renderer): - # This is the original, correct logic for dynamically creating stopping strings for linear templates. 
- prefix_bot, suffix_bot = self.get_generation_prefix_suffix(renderer, impersonate=False) - prefix_user, suffix_user = self.get_generation_prefix_suffix(renderer, impersonate=True) - - stopping_strings = [ - suffix_user + prefix_bot, - suffix_user + prefix_user, - suffix_bot + prefix_bot, - suffix_bot + prefix_user, - ] - - # Attempt to find a single EOT token to use as a stop string - for item in stopping_strings: - item = item.strip() - if item.startswith("<") and ">" in item: - stopping_strings.append(item.split(">")[0] + ">") - break - elif item.startswith("[") and "]" in item: - stopping_strings.append(item.split("]")[0] + "]") - break - - return stopping_strings - - def modify_for_continue(self, prompt, renderer, impersonate=False): - suffix = self.get_generation_prefix_suffix(renderer, impersonate)[1] - if len(suffix) > 0: - return prompt[:-len(suffix)] - - return prompt - - -class ChannelTemplateHandler(TemplateHandler): - """Handles channel-based templates""" - - def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): - """ - Gets the string to add to the prompt to start a new turn. - """ - if impersonate: - # For impersonate mode, we need the prefix for a user's turn. - prefix = "<|start|>user<|message|>" - else: - # For a normal reply, we need the prefix for the assistant's turn. - prefix = "<|start|>assistant" - - if strip_trailing_spaces: - prefix = prefix.rstrip(' ') - - # The suffix is not needed for this template type's generation logic. - return prefix, "" - - def get_stopping_strings(self, renderer): - # Use specific tokens that unambiguously signal the end of a turn - # or the start of a different character's turn. - return [ - '<|return|>', - '<|start|>user', - '<|start|>developer', - '<|call|>', - ] - - def modify_for_continue(self, prompt, renderer, impersonate=False): - suffix = '<|return|>' - if prompt.endswith(suffix): - return prompt[:-len(suffix)] - - return prompt - - -def create_template_handler(template_str): - """Factory function to create appropriate handler""" - if '<|channel|>' in template_str: - return ChannelTemplateHandler(template_str) - return LinearTemplateHandler(template_str) - - def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True): ''' Given a Jinja template, reverse-engineers the prefix and the suffix for @@ -236,14 +108,7 @@ def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=Tru suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] suffix = prompt.split("<<|user-message-2|>>")[1] - - # Remove the message suffix. The first case handles the GPT-OSS model - # in a way that is likely to not interfere with previous models. 
- if '<|start|>user' in suffix_plus_prefix or '<|start|>assistant' in suffix_plus_prefix: - start_index = suffix_plus_prefix.rindex('<|start|>') - prefix = suffix_plus_prefix[start_index:] - else: - prefix = suffix_plus_prefix[len(suffix):] + prefix = suffix_plus_prefix[len(suffix):] if strip_trailing_spaces: prefix = prefix.rstrip(' ') @@ -399,10 +264,6 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "user", "content": user_input}) - # Create template handler based on current template - template_str = state['instruction_template_str'] if state['mode'] == 'instruct' else chat_template_str - handler = create_template_handler(template_str) - def make_prompt(messages): if state['mode'] == 'chat-instruct' and _continue: prompt = renderer(messages=messages[:-1]) @@ -420,10 +281,10 @@ def generate_chat_prompt(user_input, state, **kwargs): command = replace_character_names(command, state['name1'], state['name2']) if _continue: - prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] + prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] prefix += messages[-1]["content"] else: - prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate)[0] + prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] if not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) @@ -431,14 +292,16 @@ def generate_chat_prompt(user_input, state, **kwargs): outer_messages.append({"role": "assistant", "content": prefix}) prompt = instruct_renderer(messages=outer_messages) - suffix = handler.get_generation_prefix_suffix(instruct_renderer, impersonate=False)[1] + suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] if len(suffix) > 0: prompt = prompt[:-len(suffix)] else: if _continue: - prompt = handler.modify_for_continue(prompt, renderer, impersonate) + suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] + if len(suffix) > 0: + prompt = prompt[:-len(suffix)] else: - prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate)[0] + prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] if state['mode'] == 'chat' and not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) @@ -564,16 +427,31 @@ def get_stopping_strings(state): if state['mode'] in ['instruct', 'chat-instruct']: template = jinja_env.from_string(state['instruction_template_str']) renderer = partial(template.render, add_generation_prompt=False) - renderers.append((renderer, state['instruction_template_str'])) + renderers.append(renderer) if state['mode'] in ['chat', 'chat-instruct']: template = jinja_env.from_string(state['chat_template_str']) renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2']) - renderers.append((renderer, state['chat_template_str'])) + renderers.append(renderer) - for renderer, template_str in renderers: - handler = create_template_handler(template_str) - stopping_strings += handler.get_stopping_strings(renderer) + for renderer in renderers: + prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False) + prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True) + + stopping_strings += [ + suffix_user + prefix_bot, + suffix_user + prefix_user, + suffix_bot + prefix_bot, + suffix_bot + prefix_user, + ] + + # Try to find the EOT token + for item in stopping_strings.copy(): + item = 
item.strip() + if item.startswith("<") and ">" in item: + stopping_strings.append(item.split(">")[0] + ">") + elif item.startswith("[") and "]" in item: + stopping_strings.append(item.split("]")[0] + "]") if 'stopping_strings' in state and isinstance(state['stopping_strings'], list): stopping_strings += state.pop('stopping_strings') @@ -772,16 +650,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess output = apply_extensions('history', output) state = apply_extensions('state', state) - # Automatically set skip_special_tokens to False for channel-based templates - if state['mode'] in ['instruct', 'chat-instruct']: - template_str = state['instruction_template_str'] - else: # chat mode - template_str = state['chat_template_str'] - - handler = create_template_handler(template_str) - if isinstance(handler, ChannelTemplateHandler): - state['skip_special_tokens'] = False - # Let the jinja2 template handle the BOS token if state['mode'] in ['instruct', 'chat-instruct']: state['add_bos_token'] = False @@ -941,10 +809,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess def impersonate_wrapper(textbox, state): - # Check template support first - template_str = state['chat_template_str'] - handler = create_template_handler(template_str) - text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) From bfbbfc2361e26b03e5af9a26434391be9fd257f1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:33:01 -0700 Subject: [PATCH 21/27] Ignore add_generation_prompt in GPT-OSS --- modules/models_settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/models_settings.py b/modules/models_settings.py index 3c068df0..e35e1c04 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -93,6 +93,7 @@ def get_model_metadata(model): template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'raise_exception\([^)]*\)', "''", template) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) + template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template @@ -153,6 +154,7 @@ def get_model_metadata(model): template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'raise_exception\([^)]*\)', "''", template) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) + template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template From fbea21a1f13186740012c55aa7877a5aeda89c2f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:33:27 -0700 Subject: [PATCH 22/27] Only use enable_thinking if the template supports it --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 82760cc8..e7fd86f9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -307,7 +307,7 @@ def generate_chat_prompt(user_input, state, **kwargs): 
prompt += prefix - if state['mode'] == 'instruct' and not any((_continue, impersonate, state['enable_thinking'])): + if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])): prompt += get_thinking_suppression_string(instruction_template) return prompt From 7c82d65a9d071342cc501246760f8a875e5097a7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:05:09 -0700 Subject: [PATCH 23/27] Handle GPT-OSS as a special template case --- modules/chat.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index e7fd86f9..66f89c70 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -302,6 +302,13 @@ def generate_chat_prompt(user_input, state, **kwargs): prompt = prompt[:-len(suffix)] else: prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + + # Handle GPT-OSS as a special case + if '<|channel|>final<|message|>' in state['instruction_template_str']: + prefix = prefix.rstrip("<|channel|>final<|message|>") + if impersonate: + prefix += "<|message|>" + if state['mode'] == 'chat' and not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) @@ -460,6 +467,12 @@ def get_stopping_strings(state): result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)] result = list(set(result)) + # Handle GPT-OSS as a special case + if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result: + result.remove("<|end|>") + result.append("<|result|>") + result = list(set(result)) + if shared.args.verbose: logger.info("STOPPING_STRINGS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result) @@ -650,6 +663,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess output = apply_extensions('history', output) state = apply_extensions('state', state) + # Handle GPT-OSS as a special case + if '<|channel|>final<|message|>' in state['instruction_template_str']: + state['skip_special_tokens'] = False + # Let the jinja2 template handle the BOS token if state['mode'] in ['instruct', 'chat-instruct']: state['add_bos_token'] = False From 6ce4b353c49a1b9b86cf842a1d30ec2198f5d9b7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Aug 2025 06:42:45 -0700 Subject: [PATCH 24/27] Fix the GPT-OSS template --- modules/chat.py | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 66f89c70..e07dfd1c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -211,7 +211,39 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) if assistant_msg: - messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) + # Handle GPT-OSS as a special case + if '<|channel|>analysis<|message|>' in assistant_msg or '<|channel|>final<|message|>' in assistant_msg: + + thinking_content = "" + final_content = "" + + # Extract analysis content if present + if '<|channel|>analysis<|message|>' in assistant_msg: + analysis_start = assistant_msg.find('<|channel|>analysis<|message|>') + len('<|channel|>analysis<|message|>') + if '<|start|>assistant<|channel|>final<|message|>' in assistant_msg: + analysis_end = assistant_msg.find('<|start|>assistant<|channel|>final<|message|>') 
+ else: + analysis_end = len(assistant_msg) + + thinking_content = assistant_msg[analysis_start:analysis_end].strip() + + # Extract final content if present + if '<|start|>assistant<|channel|>final<|message|>' in assistant_msg: + final_start = assistant_msg.find('<|start|>assistant<|channel|>final<|message|>') + len('<|start|>assistant<|channel|>final<|message|>') + final_content = assistant_msg[final_start:].strip() + elif '<|channel|>final<|message|>' in assistant_msg: + final_start = assistant_msg.find('<|channel|>final<|message|>') + len('<|channel|>final<|message|>') + final_content = assistant_msg[final_start:].strip() + + # Insert as structured message + msg_dict = {"role": "assistant", "content": final_content} + if thinking_content: + msg_dict["thinking"] = thinking_content + + messages.insert(insert_pos, msg_dict) + + else: + messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: # Check for user message attachments in metadata @@ -305,7 +337,9 @@ def generate_chat_prompt(user_input, state, **kwargs): # Handle GPT-OSS as a special case if '<|channel|>final<|message|>' in state['instruction_template_str']: - prefix = prefix.rstrip("<|channel|>final<|message|>") + if prefix.endswith("<|channel|>final<|message|>"): + prefix = prefix[:-len("<|channel|>final<|message|>")] + if impersonate: prefix += "<|message|>" From 0c1403f2c72133e1ff63154d21f37954a2e1c343 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Aug 2025 08:05:37 -0700 Subject: [PATCH 25/27] Handle GPT-OSS as a special case when continuing --- modules/chat.py | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index e07dfd1c..64588b9d 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -237,7 +237,7 @@ def generate_chat_prompt(user_input, state, **kwargs): # Insert as structured message msg_dict = {"role": "assistant", "content": final_content} - if thinking_content: + if '<|channel|>analysis<|message|>' in assistant_msg: msg_dict["thinking"] = thinking_content messages.insert(insert_pos, msg_dict) @@ -328,25 +328,42 @@ def generate_chat_prompt(user_input, state, **kwargs): if len(suffix) > 0: prompt = prompt[:-len(suffix)] else: - if _continue: - suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] - if len(suffix) > 0: - prompt = prompt[:-len(suffix)] + # Handle GPT-OSS as a special case when continuing + if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']: + # This prevents the template from stripping the analysis block of the message being continued. 
+ + last_message_to_continue = messages[-1] + prompt = renderer(messages=messages[:-1]) + + assistant_reply_so_far = "" + if 'thinking' in last_message_to_continue: + assistant_reply_so_far += f"<|start|>assistant<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" + + assistant_reply_so_far += f"<|start|>assistant<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" + + prompt += assistant_reply_so_far + else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + prompt = renderer(messages=messages) + if _continue: + suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] + if len(suffix) > 0: + prompt = prompt[:-len(suffix)] + else: + prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] - # Handle GPT-OSS as a special case - if '<|channel|>final<|message|>' in state['instruction_template_str']: - if prefix.endswith("<|channel|>final<|message|>"): - prefix = prefix[:-len("<|channel|>final<|message|>")] + # Handle GPT-OSS as a special case when not continuing + if '<|channel|>final<|message|>' in state['instruction_template_str']: + if prefix.endswith("<|channel|>final<|message|>"): + prefix = prefix[:-len("<|channel|>final<|message|>")] - if impersonate: - prefix += "<|message|>" + if impersonate: + prefix += "<|message|>" - if state['mode'] == 'chat' and not impersonate: - prefix = apply_extensions('bot_prefix', prefix, state) + if state['mode'] == 'chat' and not impersonate: + prefix = apply_extensions('bot_prefix', prefix, state) - prompt += prefix + prompt += prefix if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])): prompt += get_thinking_suppression_string(instruction_template) From 3e24f455c8cca90310d5a3f9db31ed2007520db3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Aug 2025 10:18:42 -0700 Subject: [PATCH 26/27] Fix continue for GPT-OSS (hopefully the final fix) --- modules/chat.py | 54 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 64588b9d..1ab91b5e 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -219,21 +219,39 @@ def generate_chat_prompt(user_input, state, **kwargs): # Extract analysis content if present if '<|channel|>analysis<|message|>' in assistant_msg: - analysis_start = assistant_msg.find('<|channel|>analysis<|message|>') + len('<|channel|>analysis<|message|>') - if '<|start|>assistant<|channel|>final<|message|>' in assistant_msg: - analysis_end = assistant_msg.find('<|start|>assistant<|channel|>final<|message|>') - else: - analysis_end = len(assistant_msg) + # Split the message by the analysis tag to isolate the content that follows + parts = assistant_msg.split('<|channel|>analysis<|message|>', 1) + if len(parts) > 1: + # The content is everything after the tag + potential_content = parts[1] - thinking_content = assistant_msg[analysis_start:analysis_end].strip() + # Now, find the end of this content block + analysis_end_tag = '<|end|>' + if analysis_end_tag in potential_content: + thinking_content = potential_content.split(analysis_end_tag, 1)[0].strip() + else: + # Fallback: if no <|end|> tag, stop at the start of the final channel if it exists + final_channel_tag = '<|channel|>final<|message|>' + if final_channel_tag in potential_content: + thinking_content = potential_content.split(final_channel_tag, 
1)[0].strip() + else: + thinking_content = potential_content.strip() # Extract final content if present - if '<|start|>assistant<|channel|>final<|message|>' in assistant_msg: - final_start = assistant_msg.find('<|start|>assistant<|channel|>final<|message|>') + len('<|start|>assistant<|channel|>final<|message|>') - final_content = assistant_msg[final_start:].strip() - elif '<|channel|>final<|message|>' in assistant_msg: - final_start = assistant_msg.find('<|channel|>final<|message|>') + len('<|channel|>final<|message|>') - final_content = assistant_msg[final_start:].strip() + final_tag_to_find = '<|channel|>final<|message|>' + if final_tag_to_find in assistant_msg: + # Split the message by the final tag to isolate the content that follows + parts = assistant_msg.split(final_tag_to_find, 1) + if len(parts) > 1: + # The content is everything after the tag + potential_content = parts[1] + + # Now, find the end of this content block + final_end_tag = '<|end|>' + if final_end_tag in potential_content: + final_content = potential_content.split(final_end_tag, 1)[0].strip() + else: + final_content = potential_content.strip() # Insert as structured message msg_dict = {"role": "assistant", "content": final_content} @@ -330,16 +348,16 @@ def generate_chat_prompt(user_input, state, **kwargs): else: # Handle GPT-OSS as a special case when continuing if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']: - # This prevents the template from stripping the analysis block of the message being continued. - last_message_to_continue = messages[-1] prompt = renderer(messages=messages[:-1]) - assistant_reply_so_far = "" - if 'thinking' in last_message_to_continue: - assistant_reply_so_far += f"<|start|>assistant<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" + # Start the assistant turn wrapper + assistant_reply_so_far = "<|start|>assistant" - assistant_reply_so_far += f"<|start|>assistant<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" + if 'thinking' in last_message_to_continue: + assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" + + assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" prompt += assistant_reply_so_far From f1147c992618ee17a7f5a37331d99d00ad02fd79 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Aug 2025 19:32:36 -0700 Subject: [PATCH 27/27] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files 
changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index dd1e8d35..f17cae8a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index b65f0b09..51f4571f 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 6e698654..37021c77 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 84abd394..f54ae191 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 2deefbc4..e495455b 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == 
"3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 8c1baf04..72847534 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 67a44432..ed641a24 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 9fe3c54b..d7fe735b 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 
tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 50602d8d..cb71f74b 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index abdcfc16..d6bed576 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 30d7d9e4..1f17dc50 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index a7c7808a..82254842 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; 
platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index b1f66f56..986a3d49 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 76530338..833e923b 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 26235b83..6a894d49 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cfa76310..0afb19c2 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 2f8c401d..a404f50c 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index e0650575..75176656 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
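A note on the GPT-OSS ("harmony") parsing: it is spread across patches 13, 15, 24 and 26 above, which makes the final control flow hard to follow in diff form. The sketch below condenses the dual-format thinking-block extraction that those patches converge on, so the logic can be exercised in isolation. The tag strings are copied from the diffs; the function name split_thinking, its docstring, and the sample string are illustrative assumptions and not code from the repository.

# Standalone sketch (not part of any patch above) of the dual-format
# thinking-block extraction built up in modules/html_generator.py.
THINK_START = "<think>"
THINK_END = "</think>"
ALT_START = "<|channel|>analysis<|message|>"
ALT_END = "<|end|>"
ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>"


def split_thinking(string):
    """Return (thinking, remainder); thinking is None when no block is found."""
    # Classic <think>...</think> format first
    start = string.find(THINK_START)
    end = string.find(THINK_END)
    if start != -1 or end != -1:
        thought_start = 0 if start == -1 else start + len(THINK_START)
        if end == -1:
            return string[thought_start:], ""
        return string[thought_start:end], string[end + len(THINK_END):]

    # GPT-OSS channel format: the analysis channel is the "thinking",
    # the final channel is the visible reply
    a_start = string.find(ALT_START)
    a_end = string.find(ALT_END)
    a_content = string.find(ALT_CONTENT_START)
    if a_start == -1 and a_end == -1:
        return None, string

    thought_start = 0 if a_start == -1 else a_start + len(ALT_START)
    if a_end == -1:
        # No explicit <|end|>: fall back to the final-channel marker if present
        thought_end = a_content if a_content != -1 else len(string)
        content_start = a_content + len(ALT_CONTENT_START) if a_content != -1 else len(string)
    else:
        thought_end = a_end
        content_start = a_content + len(ALT_CONTENT_START) if a_content != -1 else a_end + len(ALT_END)

    return string[thought_start:thought_end], string[content_start:]


if __name__ == "__main__":
    sample = ("<|channel|>analysis<|message|>Let me reason about this.<|end|>"
              "<|start|>assistant<|channel|>final<|message|>Here is the answer.")
    print(split_thinking(sample))  # ('Let me reason about this.', 'Here is the answer.')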