From d746484521c527f64b66264bc9d3ecc22b7461c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:52:51 -0700 Subject: [PATCH 01/27] Handle both int and str types in grammar char processing --- modules/grammar/grammar_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/grammar/grammar_utils.py b/modules/grammar/grammar_utils.py index 7f09ff82..af78f6b9 100644 --- a/modules/grammar/grammar_utils.py +++ b/modules/grammar/grammar_utils.py @@ -463,7 +463,7 @@ class IncrementalGrammarConstraint(GrammarConstraint): super().__init__(grammar_str, start_rule_name, tokenizer) def accept_char(self, char, stacks): - byte = ord(char) + byte = char if isinstance(char, int) else ord(char) new_stacks = [] for stack in stacks: # stack is empty @@ -549,7 +549,7 @@ class IncrementalGrammarConstraint(GrammarConstraint): # For each sub-rule in the grammar, cache whether each byte is accepted. @lru_cache(maxsize=None) def pos_char_acceptance(self, pos, char): - byte = ord(char) + byte = char if isinstance(char, int) else ord(char) num_chars = self.grammar_encoding[pos] pos += 1 for i in range(0, num_chars, 2): From f08bb9a2012eeac213232c2fe087ba330b1801fb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Jul 2025 10:34:59 -0700 Subject: [PATCH 02/27] Handle edge case in chat history loading (closes #7155) --- modules/chat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index 827b6050..1a16a689 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -1175,6 +1175,9 @@ def save_last_chat_state(character, mode, unique_id): def load_history(unique_id, character, mode): p = get_history_file_path(unique_id, character, mode) + if not p.exists(): + return {'internal': [], 'visible': [], 'metadata': {}} + f = json.loads(open(p, 'rb').read()) if 'internal' in f and 'visible' in f: history = f From 74230f559ab5e8536ff22352c4910191667ab12c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 1 Aug 2025 11:03:15 -0700 Subject: [PATCH 03/27] Bump transformers to 4.54 --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 687f1f5a..9810c65a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 8224d987..314e7d4f 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* 
triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 22141a8a..c7a8ba9b 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 04325464..6ce6ae9b 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 9497575f..53128210 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 8a84e403..6ba29008 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 9488f5e7..a1bd0ffc 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index a2af5108..f21a5208 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 948a275a..e76d6668 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 8f7106e4..5fbe49e7 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt 
b/requirements/full/requirements_nowheels.txt index 69a82184..21588344 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.53.* +transformers==4.54.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb From 02a3420a506631f50bdb3dbd3d6c22ef4344c343 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:09:30 -0700 Subject: [PATCH 04/27] Bump transformers to 4.55 (adds gpt-oss support) --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- requirements/full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 9810c65a..3a30a6c7 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 314e7d4f..ae269bc9 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index c7a8ba9b..14871b4b 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 6ce6ae9b..49357939 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 53128210..de33cdb8 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 6ba29008..58496d9d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -22,7 +22,7 @@ 
safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index a1bd0ffc..c5322076 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index f21a5208..804ef934 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index e76d6668..06d93d65 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 5fbe49e7..f9e5fb73 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 21588344..cd85a744 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.54.* +transformers==4.55.* triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb From 59890435376110f58656037b1871be553bafbb1f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:22:18 -0700 Subject: [PATCH 05/27] Transformers: Support standalone .jinja chat templates (for GPT-OSS) --- modules/models_settings.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index a06e594e..c4dfb0ed 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -122,13 +122,25 @@ def get_model_metadata(model): # Try to find the Jinja instruct template path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' + template = None + + # 1. Prioritize reading from chat_template.jinja if it exists + jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja' + if jinja_path.exists(): + with open(jinja_path, 'r', encoding='utf-8') as f: + template = f.read() + if path.exists(): metadata = json.loads(open(path, 'r', encoding='utf-8').read()) - if 'chat_template' in metadata: + + # 2. 
Only read from metadata if we haven't already loaded from .jinja + if template is None and 'chat_template' in metadata: template = metadata['chat_template'] if isinstance(template, list): template = template[0]['template'] + # 3. If a template was found from either source, process it + if template: for k in ['eos_token', 'bos_token']: if k in metadata: value = metadata[k] From 3039aeffeb8958724de56912d0d90267b87a7074 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:35:17 -0700 Subject: [PATCH 06/27] Fix parsing the gpt-oss-20b template --- modules/models_settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index c4dfb0ed..8ed7f953 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -90,7 +90,7 @@ def get_model_metadata(model): template = template.replace('eos_token', "'{}'".format(eos_token)) template = template.replace('bos_token', "'{}'".format(bos_token)) - template = re.sub(r'raise_exception\([^)]*\)', "''", template) + template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template @@ -149,7 +149,7 @@ def get_model_metadata(model): template = template.replace(k, "'{}'".format(value)) - template = re.sub(r'raise_exception\([^)]*\)', "''", template) + template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template From 3b28dc182186308648af5cddb44c811f1608c70d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:35:53 -0700 Subject: [PATCH 07/27] Don't pass torch_dtype to transformers loader, let it be autodetected --- modules/transformers_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py index 2f7367a4..e4072125 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -136,7 +136,6 @@ def load_model_HF(model_name): path_to_model = Path(f'{shared.args.model_dir}/{model_name}') params = { 'low_cpu_mem_usage': True, - 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, 'attn_implementation': shared.args.attn_implementation, } From 9f28f53cfc7d14cf8e3c9ebd00834126f73675b4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:56:00 -0700 Subject: [PATCH 08/27] Better parsing of the gpt-oss template --- modules/chat.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 1a16a689..c10d91a7 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -108,7 +108,14 @@ def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=Tru suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] suffix = prompt.split("<<|user-message-2|>>")[1] - prefix = suffix_plus_prefix[len(suffix):] + + # Remove the message suffix. 
The first case handles the GPT-OSS model + # in a way that is likely to not interfere with previous models. + if '<|start|>user' in suffix_plus_prefix or '<|start|>assistant' in suffix_plus_prefix: + start_index = suffix_plus_prefix.rindex('<|start|>') + prefix = suffix_plus_prefix[start_index:] + else: + prefix = suffix_plus_prefix[len(suffix):] if strip_trailing_spaces: prefix = prefix.rstrip(' ') From 178c3e75cca827657a018a64ae3d7945d9e25231 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:38:06 -0700 Subject: [PATCH 09/27] Handle templates with channels separately --- modules/chat.py | 184 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 157 insertions(+), 27 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index c10d91a7..f929f653 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -86,6 +86,134 @@ yaml.add_representer(str, str_presenter) yaml.representer.SafeRepresenter.add_representer(str, str_presenter) +# Template Handler Classes +class TemplateHandler: + """Base class for handling different template types""" + + def __init__(self, template_str): + self.template_str = template_str + + def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): + """Get prefix/suffix for generation""" + return "", "" + + def get_stopping_strings(self, renderer): + """Get stopping strings for this template type""" + return [] + + def modify_for_continue(self, prompt, renderer, impersonate=False): + """Modify prompt for continue mode""" + return prompt + + def supports_impersonate(self): + """Whether impersonate mode is supported""" + return False + + +class LinearTemplateHandler(TemplateHandler): + """Handles traditional linear templates""" + + def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): + # This is the original, complex logic for deriving prefix/suffix for old templates. + if impersonate: + messages = [ + {"role": "user", "content": "<<|user-message-1|>>"}, + {"role": "user", "content": "<<|user-message-2|>>"}, + ] + else: + messages = [ + {"role": "assistant", "content": "<<|user-message-1|>>"}, + {"role": "assistant", "content": "<<|user-message-2|>>"}, + ] + + prompt = renderer(messages=messages) + suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] + suffix = prompt.split("<<|user-message-2|>>")[1] + + if '<|start|>user' in suffix_plus_prefix or '<|start|>assistant' in suffix_plus_prefix: + start_index = suffix_plus_prefix.rindex('<|start|>') + prefix = suffix_plus_prefix[start_index:] + else: + prefix = suffix_plus_prefix[len(suffix):] + + if strip_trailing_spaces: + prefix = prefix.rstrip(' ') + + return prefix, suffix + + def get_stopping_strings(self, renderer): + # This is the original, correct logic for dynamically creating stopping strings for linear templates. 
+ prefix_bot, suffix_bot = self.get_generation_prefix_suffix(renderer, impersonate=False) + prefix_user, suffix_user = self.get_generation_prefix_suffix(renderer, impersonate=True) + + stopping_strings = [ + suffix_user + prefix_bot, + suffix_user + prefix_user, + suffix_bot + prefix_bot, + suffix_bot + prefix_user, + ] + + # Attempt to find a single EOT token to use as a stop string + for item in stopping_strings: + item = item.strip() + if item.startswith("<") and ">" in item: + stopping_strings.append(item.split(">")[0] + ">") + break + elif item.startswith("[") and "]" in item: + stopping_strings.append(item.split("]")[0] + "]") + break + + return stopping_strings + + def modify_for_continue(self, prompt, renderer, impersonate=False): + suffix = self.get_generation_prefix_suffix(renderer, impersonate)[1] + if len(suffix) > 0: + return prompt[:-len(suffix)] + return prompt + + def supports_impersonate(self): + return True + + +class ChannelTemplateHandler(TemplateHandler): + """Handles channel-based templates""" + + def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): + """ + Gets the string to add to the prompt to start a new generation. + """ + dummy_message = [{'role': 'user', 'content': '...'}] + prompt_without_gen = renderer(messages=dummy_message, add_generation_prompt=False) + prompt_with_gen = renderer(messages=dummy_message, add_generation_prompt=True) + generation_prompt = prompt_with_gen[len(prompt_without_gen):] + + if strip_trailing_spaces: + generation_prompt = generation_prompt.rstrip(' ') + + return generation_prompt, "" + + def get_stopping_strings(self, renderer): + return [ + '<|return|>', + '<|start|>user', + '<|start|>developer', + '<|call|>' + ] + + def modify_for_continue(self, prompt, renderer, impersonate=False): + return prompt + + def supports_impersonate(self): + return False + + +def create_template_handler(template_str): + """Factory function to create appropriate handler""" + if '<|channel|>' in template_str: + return ChannelTemplateHandler(template_str) + return LinearTemplateHandler(template_str) + + def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True): ''' Given a Jinja template, reverse-engineers the prefix and the suffix for @@ -270,6 +398,15 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "user", "content": user_input}) + # Create template handler based on current template + template_str = state['instruction_template_str'] if state['mode'] == 'instruct' else chat_template_str + handler = create_template_handler(template_str) + + # Check impersonate support early + if impersonate and not handler.supports_impersonate(): + logger.warning("Impersonate not supported for channel-based templates") + return "" + def make_prompt(messages): if state['mode'] == 'chat-instruct' and _continue: prompt = renderer(messages=messages[:-1]) @@ -287,10 +424,10 @@ def generate_chat_prompt(user_input, state, **kwargs): command = replace_character_names(command, state['name1'], state['name2']) if _continue: - prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] + prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] prefix += messages[-1]["content"] else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate)[0] if not impersonate: prefix = 
apply_extensions('bot_prefix', prefix, state) @@ -298,16 +435,14 @@ def generate_chat_prompt(user_input, state, **kwargs): outer_messages.append({"role": "assistant", "content": prefix}) prompt = instruct_renderer(messages=outer_messages) - suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] + suffix = handler.get_generation_prefix_suffix(instruct_renderer, impersonate=False)[1] if len(suffix) > 0: prompt = prompt[:-len(suffix)] else: if _continue: - suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] - if len(suffix) > 0: - prompt = prompt[:-len(suffix)] + prompt = handler.modify_for_continue(prompt, renderer, impersonate) else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate)[0] if state['mode'] == 'chat' and not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) @@ -433,31 +568,16 @@ def get_stopping_strings(state): if state['mode'] in ['instruct', 'chat-instruct']: template = jinja_env.from_string(state['instruction_template_str']) renderer = partial(template.render, add_generation_prompt=False) - renderers.append(renderer) + renderers.append((renderer, state['instruction_template_str'])) if state['mode'] in ['chat', 'chat-instruct']: template = jinja_env.from_string(state['chat_template_str']) renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2']) - renderers.append(renderer) + renderers.append((renderer, state['chat_template_str'])) - for renderer in renderers: - prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False) - prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True) - - stopping_strings += [ - suffix_user + prefix_bot, - suffix_user + prefix_user, - suffix_bot + prefix_bot, - suffix_bot + prefix_user, - ] - - # Try to find the EOT token - for item in stopping_strings.copy(): - item = item.strip() - if item.startswith("<") and ">" in item: - stopping_strings.append(item.split(">")[0] + ">") - elif item.startswith("[") and "]" in item: - stopping_strings.append(item.split("]")[0] + "]") + for renderer, template_str in renderers: + handler = create_template_handler(template_str) + stopping_strings += handler.get_stopping_strings(renderer) if 'stopping_strings' in state and isinstance(state['stopping_strings'], list): stopping_strings += state.pop('stopping_strings') @@ -809,6 +929,16 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess def impersonate_wrapper(textbox, state): + # Check template support first + template_str = state['chat_template_str'] + handler = create_template_handler(template_str) + + if not handler.supports_impersonate(): + logger.warning("Impersonate not supported for channel-based templates") + static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + yield textbox, static_output + return + text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) From ecd16d6bf9f680ba5b25eb837bf61569dde81886 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:57:49 -0700 Subject: [PATCH 10/27] Automatically set skip_special_tokens to False for channel-based templates --- modules/chat.py | 10 ++++++++++ 1 file changed, 10 insertions(+) 
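Editorial aside (not part of the patch series): patches 09 and 10 hinge on one dispatch rule, namely that a chat template containing the literal '<|channel|>' marker is treated as a GPT-OSS-style channel template whose turn boundaries are literal control tokens ('<|start|>', '<|message|>', '<|return|>', '<|call|>') rather than a reverse-engineered prefix/suffix pair; that is also why the hunk below forces skip_special_tokens off, so those tokens survive decoding. A minimal usage sketch, assuming the handler classes from PATCH 09 are importable from modules.chat as the diffs suggest (the template string below is illustrative only, not taken from the repository):

    from modules.chat import (
        ChannelTemplateHandler,
        LinearTemplateHandler,
        create_template_handler,
    )

    # A channel-style template is recognized purely by the '<|channel|>' marker.
    channel_template = "<|start|>assistant<|channel|>final<|message|>{{ message['content'] }}<|end|>"
    handler = create_template_handler(channel_template)
    assert isinstance(handler, ChannelTemplateHandler)

    # Channel templates use fixed control tokens as stop strings, so the
    # renderer argument is not consulted.
    print(handler.get_stopping_strings(renderer=None))
    # ['<|return|>', '<|start|>user', '<|start|>developer', '<|call|>']

    # Templates without the marker keep the legacy linear prefix/suffix logic.
    assert isinstance(create_template_handler("{{ messages }}"), LinearTemplateHandler)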
diff --git a/modules/chat.py b/modules/chat.py index f929f653..46d24a6f 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -770,6 +770,16 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess output = apply_extensions('history', output) state = apply_extensions('state', state) + # Automatically set skip_special_tokens to False for channel-based templates + if state['mode'] in ['instruct', 'chat-instruct']: + template_str = state['instruction_template_str'] + else: # chat mode + template_str = state['chat_template_str'] + + handler = create_template_handler(template_str) + if isinstance(handler, ChannelTemplateHandler): + state['skip_special_tokens'] = False + # Let the jinja2 template handle the BOS token if state['mode'] in ['instruct', 'chat-instruct']: state['add_bos_token'] = False From 5c5a4dfc140d3e6558c97fc84c430faa2444ef28 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 13:03:18 -0700 Subject: [PATCH 11/27] Fix impersonate --- modules/chat.py | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 46d24a6f..043908c9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -105,10 +105,6 @@ class TemplateHandler: """Modify prompt for continue mode""" return prompt - def supports_impersonate(self): - """Whether impersonate mode is supported""" - return False - class LinearTemplateHandler(TemplateHandler): """Handles traditional linear templates""" @@ -171,41 +167,41 @@ class LinearTemplateHandler(TemplateHandler): return prompt[:-len(suffix)] return prompt - def supports_impersonate(self): - return True - class ChannelTemplateHandler(TemplateHandler): """Handles channel-based templates""" def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): """ - Gets the string to add to the prompt to start a new generation. + Gets the string to add to the prompt to start a new turn. """ - dummy_message = [{'role': 'user', 'content': '...'}] - prompt_without_gen = renderer(messages=dummy_message, add_generation_prompt=False) - prompt_with_gen = renderer(messages=dummy_message, add_generation_prompt=True) - generation_prompt = prompt_with_gen[len(prompt_without_gen):] + if impersonate: + # For impersonate mode, we need the prefix for a user's turn. + prefix = "<|start|>user<|message|>" + else: + # For a normal reply, we need the prefix for the assistant's turn. + prefix = "<|start|>assistant" if strip_trailing_spaces: - generation_prompt = generation_prompt.rstrip(' ') + prefix = prefix.rstrip(' ') - return generation_prompt, "" + # The suffix is not needed for this template type's generation logic. + return prefix, "" def get_stopping_strings(self, renderer): + # Use specific tokens that unambiguously signal the end of a turn + # or the start of a different character's turn. return [ '<|return|>', '<|start|>user', '<|start|>developer', - '<|call|>' + '<|call|>', ] def modify_for_continue(self, prompt, renderer, impersonate=False): + # Channels don't need suffix stripping for the continue logic to work. 
return prompt - def supports_impersonate(self): - return False - def create_template_handler(template_str): """Factory function to create appropriate handler""" @@ -402,11 +398,6 @@ def generate_chat_prompt(user_input, state, **kwargs): template_str = state['instruction_template_str'] if state['mode'] == 'instruct' else chat_template_str handler = create_template_handler(template_str) - # Check impersonate support early - if impersonate and not handler.supports_impersonate(): - logger.warning("Impersonate not supported for channel-based templates") - return "" - def make_prompt(messages): if state['mode'] == 'chat-instruct' and _continue: prompt = renderer(messages=messages[:-1]) @@ -943,12 +934,6 @@ def impersonate_wrapper(textbox, state): template_str = state['chat_template_str'] handler = create_template_handler(template_str) - if not handler.supports_impersonate(): - logger.warning("Impersonate not supported for channel-based templates") - static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - yield textbox, static_output - return - text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) From 42e3a7a5ae7011d12987f23056220c506af69af6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 14:56:12 -0700 Subject: [PATCH 12/27] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 3a30a6c7..dd1e8d35 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index ae269bc9..b65f0b09 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 14871b4b..6e698654 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != 
"Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 49357939..84abd394 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index de33cdb8..2deefbc4 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl 
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 58496d9d..8c1baf04 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index c5322076..67a44432 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 804ef934..9fe3c54b 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 06d93d65..50602d8d 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index f9e5fb73..abdcfc16 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 53479a80..30d7d9e4 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index d7336d2f..a7c7808a 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 1edaa515..b1f66f56 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" 
and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 04c9b283..76530338 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 3c3563d3..26235b83 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cf0d7b11..cfa76310 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 9bd8a37c..2f8c401d 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index b8519553..e0650575 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 6bb8212731db0dddb00d10494e56223718401d4c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:06:22 -0700 Subject: [PATCH 13/27] Fix thinking block rendering for GPT-OSS --- modules/html_generator.py | 63 ++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index 6844c244..8777acf7 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -116,29 +116,56 @@ def extract_thinking_block(string): THINK_START_TAG = "<think>" THINK_END_TAG = "</think>" - # Look for think tag + # Look for think tag first start_pos = string.find(THINK_START_TAG) end_pos = string.find(THINK_END_TAG) - # Return if neither tag is in string - if start_pos == -1 and end_pos == -1: - return None, string + # If think tags found, use existing logic + if 
start_pos != -1 or end_pos != -1: + # handle missing start or end tags + if start_pos == -1: + thought_start = 0 + else: + thought_start = start_pos + len(THINK_START_TAG) + if end_pos == -1: + thought_end = len(string) + content_start = len(string) + else: + thought_end = end_pos + content_start = end_pos + len(THINK_END_TAG) + thinking_content = string[thought_start:thought_end] + remaining_content = string[content_start:] + return thinking_content, remaining_content - # handle missing start or end tags - if start_pos == -1: - thought_start = 0 - else: - thought_start = start_pos + len(THINK_START_TAG) - if end_pos == -1: - thought_end = len(string) - content_start = len(string) - else: - thought_end = end_pos - content_start = end_pos + len(THINK_END_TAG) + # If think tags not found, try alternative format + ALT_START = "<|channel|>analysis<|message|>" + ALT_END = "<|end|>" + ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>" - thinking_content = string[thought_start:thought_end] - remaining_content = string[content_start:] - return thinking_content, remaining_content + alt_start_pos = string.find(ALT_START) + alt_end_pos = string.find(ALT_END) + alt_content_pos = string.find(ALT_CONTENT_START) + + # Check if start tag or end tag is found + if alt_start_pos != -1 or alt_end_pos != -1: + if alt_start_pos == -1: + thought_start = 0 + else: + thought_start = alt_start_pos + len(ALT_START) + + if alt_end_pos == -1: + thought_end = len(string) + content_start = len(string) + else: + thought_end = alt_end_pos + content_start = alt_content_pos + len(ALT_CONTENT_START) if alt_content_pos != -1 else len(string) + + thinking_content = string[thought_start:thought_end] + remaining_content = string[content_start:] + return thinking_content, remaining_content + + # Return if neither format is found + return None, string @functools.lru_cache(maxsize=None) From 498778b8ac85990158713c1925ca657d1fa135c8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:19:11 -0700 Subject: [PATCH 14/27] Add a new 'Reasoning effort' UI element --- extensions/openai/typing.py | 1 + modules/chat.py | 3 ++- modules/loaders.py | 5 +++++ modules/shared.py | 1 + modules/ui.py | 2 ++ modules/ui_chat.py | 3 ++- 6 files changed, 13 insertions(+), 2 deletions(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 6643ed16..6bd3749f 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -43,6 +43,7 @@ class GenerationOptions(BaseModel): ban_eos_token: bool = False add_bos_token: bool = True enable_thinking: bool = True + reasoning_effort: str = "medium" skip_special_tokens: bool = True static_cache: bool = False truncation_length: int = 0 diff --git a/modules/chat.py b/modules/chat.py index 043908c9..dd923d67 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -306,7 +306,8 @@ def generate_chat_prompt(user_input, state, **kwargs): builtin_tools=None, tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, - add_generation_prompt=False + add_generation_prompt=False, + reasoning_effort=state.get('reasoning_effort', 'medium') ) chat_renderer = partial( diff --git a/modules/loaders.py b/modules/loaders.py index f515aeca..7546bc5b 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -137,6 +137,7 @@ def transformers_samplers(): 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'static_cache', 'seed', @@ -189,6 +190,7 @@ 
loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'seed', 'sampler_priority', @@ -236,6 +238,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'seed', 'sampler_priority', @@ -275,6 +278,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'seed', 'custom_token_bans', @@ -308,6 +312,7 @@ loaders_samplers = { 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'seed', 'sampler_priority', 'dry_sequence_breakers', diff --git a/modules/shared.py b/modules/shared.py index 5e3e11c0..ab5198d1 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -211,6 +211,7 @@ settings = { 'ban_eos_token': False, 'add_bos_token': True, 'enable_thinking': True, + 'reasoning_effort': 'medium', 'skip_special_tokens': True, 'stream': True, 'static_cache': False, diff --git a/modules/ui.py b/modules/ui.py index 98acc038..e7805046 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -215,6 +215,7 @@ def list_interface_input_elements(): 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'stream', 'static_cache', @@ -482,6 +483,7 @@ def setup_auto_save(): 'ban_eos_token', 'add_bos_token', 'enable_thinking', + 'reasoning_effort', 'skip_special_tokens', 'stream', 'static_cache', diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 4dade176..1d85a398 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -78,7 +78,8 @@ def create_ui(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle mode.') + shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.') shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search') with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) From 0e42575c57b374e2b652b15cc2e03daec3170bc6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:36:20 -0700 Subject: [PATCH 15/27] Fix thinking block parsing for GPT-OSS under llama.cpp --- modules/html_generator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index 8777acf7..79237f7f 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -146,19 +146,23 @@ def extract_thinking_block(string): alt_end_pos = string.find(ALT_END) alt_content_pos = string.find(ALT_CONTENT_START) - # Check if start tag or end tag is found if alt_start_pos != -1 or alt_end_pos != -1: if alt_start_pos == -1: thought_start = 0 else: thought_start = alt_start_pos + len(ALT_START) + # If no 
explicit end tag but content start exists, use content start as end
         if alt_end_pos == -1:
-            thought_end = len(string)
-            content_start = len(string)
+            if alt_content_pos != -1:
+                thought_end = alt_content_pos
+                content_start = alt_content_pos + len(ALT_CONTENT_START)
+            else:
+                thought_end = len(string)
+                content_start = len(string)
         else:
             thought_end = alt_end_pos
-            content_start = alt_content_pos + len(ALT_CONTENT_START) if alt_content_pos != -1 else len(string)
+            content_start = alt_content_pos + len(ALT_CONTENT_START) if alt_content_pos != -1 else alt_end_pos + len(ALT_END)

         thinking_content = string[thought_start:thought_end]
         remaining_content = string[content_start:]

From 7d98ca619558c9b77fa26130e3656b0bf8843341 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 5 Aug 2025 15:43:44 -0700
Subject: [PATCH 16/27] Make web search functional with thinking models

---
 modules/chat.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index dd923d67..dbc0e6f6 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -307,7 +307,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
         tools=state['tools'] if 'tools' in state else None,
         tools_in_user_message=False,
         add_generation_prompt=False,
-        reasoning_effort=state.get('reasoning_effort', 'medium')
+        reasoning_effort=state['reasoning_effort'])
     )

     chat_renderer = partial(
@@ -730,9 +730,9 @@ def generate_search_query(user_message, state):

     # Use a minimal state for search query generation but keep the full history
     search_state = state.copy()
-    search_state['max_new_tokens'] = 64
-    search_state['auto_max_new_tokens'] = False
+    search_state['auto_max_new_tokens'] = True
     search_state['enable_thinking'] = False
+    search_state['reasoning_effort'] = 'low'
     search_state['start_with'] = ""

     # Generate the full prompt using existing history + augmented message
@@ -742,6 +742,12 @@ def generate_search_query(user_message, state):
     for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True):
         query = reply

+    # Check for thinking block delimiters and extract content after them
+    if "</think>" in query:
+        query = query.rsplit("</think>", 1)[1]
+    elif "<|start|>assistant<|channel|>final<|message|>" in query:
+        query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
+
     # Strip and remove surrounding quotes if present
     query = query.strip()
     if len(query) >= 2 and query.startswith('"') and query.endswith('"'):

From 701048cf336946177ea216d3456e5f7cdd1cab85 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 5 Aug 2025 15:51:24 -0700
Subject: [PATCH 17/27] Try to avoid breaking jinja2 parsing for older models

---
 modules/models_settings.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/models_settings.py b/modules/models_settings.py
index 8ed7f953..3c068df0 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -91,6 +91,7 @@ def get_model_metadata(model):
                 template = template.replace('bos_token', "'{}'".format(bos_token))

                 template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
+                template = re.sub(r'raise_exception\([^)]*\)', "''", template)
                 template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
                 model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
                 model_settings['instruction_template_str'] = template
@@ -150,6 +151,7 @@
template = template.replace(k, "'{}'".format(value)) template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) + template = re.sub(r'raise_exception\([^)]*\)', "''", template) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template From e5b8d4d072f74281071a9ad911bede662d61767e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:52:56 -0700 Subject: [PATCH 18/27] Fix a typo --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index dbc0e6f6..1f4e2af0 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -307,7 +307,7 @@ def generate_chat_prompt(user_input, state, **kwargs): tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, add_generation_prompt=False, - reasoning_effort=state['reasoning_effort']) + reasoning_effort=state['reasoning_effort'] ) chat_renderer = partial( From 80f6abb07e44cb70d65a0d43fe9676a02880eb2c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:01:19 -0700 Subject: [PATCH 19/27] Begin fixing 'Continue' with GPT-OSS --- modules/chat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 1f4e2af0..b23340aa 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -165,6 +165,7 @@ class LinearTemplateHandler(TemplateHandler): suffix = self.get_generation_prefix_suffix(renderer, impersonate)[1] if len(suffix) > 0: return prompt[:-len(suffix)] + return prompt @@ -199,7 +200,10 @@ class ChannelTemplateHandler(TemplateHandler): ] def modify_for_continue(self, prompt, renderer, impersonate=False): - # Channels don't need suffix stripping for the continue logic to work. 
+ suffix = '<|return|>' + if prompt.endswith(suffix): + return prompt[:-len(suffix)] + return prompt From 20adc3c96737e35b96f6b1d557a63b1d2c75a825 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:58:45 -0700 Subject: [PATCH 20/27] Start over new template handling (to avoid overcomplicating) --- modules/chat.py | 192 +++++++----------------------------------------- 1 file changed, 28 insertions(+), 164 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index b23340aa..82760cc8 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -86,134 +86,6 @@ yaml.add_representer(str, str_presenter) yaml.representer.SafeRepresenter.add_representer(str, str_presenter) -# Template Handler Classes -class TemplateHandler: - """Base class for handling different template types""" - - def __init__(self, template_str): - self.template_str = template_str - - def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): - """Get prefix/suffix for generation""" - return "", "" - - def get_stopping_strings(self, renderer): - """Get stopping strings for this template type""" - return [] - - def modify_for_continue(self, prompt, renderer, impersonate=False): - """Modify prompt for continue mode""" - return prompt - - -class LinearTemplateHandler(TemplateHandler): - """Handles traditional linear templates""" - - def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): - # This is the original, complex logic for deriving prefix/suffix for old templates. - if impersonate: - messages = [ - {"role": "user", "content": "<<|user-message-1|>>"}, - {"role": "user", "content": "<<|user-message-2|>>"}, - ] - else: - messages = [ - {"role": "assistant", "content": "<<|user-message-1|>>"}, - {"role": "assistant", "content": "<<|user-message-2|>>"}, - ] - - prompt = renderer(messages=messages) - suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] - suffix = prompt.split("<<|user-message-2|>>")[1] - - if '<|start|>user' in suffix_plus_prefix or '<|start|>assistant' in suffix_plus_prefix: - start_index = suffix_plus_prefix.rindex('<|start|>') - prefix = suffix_plus_prefix[start_index:] - else: - prefix = suffix_plus_prefix[len(suffix):] - - if strip_trailing_spaces: - prefix = prefix.rstrip(' ') - - return prefix, suffix - - def get_stopping_strings(self, renderer): - # This is the original, correct logic for dynamically creating stopping strings for linear templates. 
- prefix_bot, suffix_bot = self.get_generation_prefix_suffix(renderer, impersonate=False) - prefix_user, suffix_user = self.get_generation_prefix_suffix(renderer, impersonate=True) - - stopping_strings = [ - suffix_user + prefix_bot, - suffix_user + prefix_user, - suffix_bot + prefix_bot, - suffix_bot + prefix_user, - ] - - # Attempt to find a single EOT token to use as a stop string - for item in stopping_strings: - item = item.strip() - if item.startswith("<") and ">" in item: - stopping_strings.append(item.split(">")[0] + ">") - break - elif item.startswith("[") and "]" in item: - stopping_strings.append(item.split("]")[0] + "]") - break - - return stopping_strings - - def modify_for_continue(self, prompt, renderer, impersonate=False): - suffix = self.get_generation_prefix_suffix(renderer, impersonate)[1] - if len(suffix) > 0: - return prompt[:-len(suffix)] - - return prompt - - -class ChannelTemplateHandler(TemplateHandler): - """Handles channel-based templates""" - - def get_generation_prefix_suffix(self, renderer, impersonate=False, strip_trailing_spaces=True): - """ - Gets the string to add to the prompt to start a new turn. - """ - if impersonate: - # For impersonate mode, we need the prefix for a user's turn. - prefix = "<|start|>user<|message|>" - else: - # For a normal reply, we need the prefix for the assistant's turn. - prefix = "<|start|>assistant" - - if strip_trailing_spaces: - prefix = prefix.rstrip(' ') - - # The suffix is not needed for this template type's generation logic. - return prefix, "" - - def get_stopping_strings(self, renderer): - # Use specific tokens that unambiguously signal the end of a turn - # or the start of a different character's turn. - return [ - '<|return|>', - '<|start|>user', - '<|start|>developer', - '<|call|>', - ] - - def modify_for_continue(self, prompt, renderer, impersonate=False): - suffix = '<|return|>' - if prompt.endswith(suffix): - return prompt[:-len(suffix)] - - return prompt - - -def create_template_handler(template_str): - """Factory function to create appropriate handler""" - if '<|channel|>' in template_str: - return ChannelTemplateHandler(template_str) - return LinearTemplateHandler(template_str) - - def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True): ''' Given a Jinja template, reverse-engineers the prefix and the suffix for @@ -236,14 +108,7 @@ def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=Tru suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] suffix = prompt.split("<<|user-message-2|>>")[1] - - # Remove the message suffix. The first case handles the GPT-OSS model - # in a way that is likely to not interfere with previous models. 
- if '<|start|>user' in suffix_plus_prefix or '<|start|>assistant' in suffix_plus_prefix: - start_index = suffix_plus_prefix.rindex('<|start|>') - prefix = suffix_plus_prefix[start_index:] - else: - prefix = suffix_plus_prefix[len(suffix):] + prefix = suffix_plus_prefix[len(suffix):] if strip_trailing_spaces: prefix = prefix.rstrip(' ') @@ -399,10 +264,6 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "user", "content": user_input}) - # Create template handler based on current template - template_str = state['instruction_template_str'] if state['mode'] == 'instruct' else chat_template_str - handler = create_template_handler(template_str) - def make_prompt(messages): if state['mode'] == 'chat-instruct' and _continue: prompt = renderer(messages=messages[:-1]) @@ -420,10 +281,10 @@ def generate_chat_prompt(user_input, state, **kwargs): command = replace_character_names(command, state['name1'], state['name2']) if _continue: - prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] + prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] prefix += messages[-1]["content"] else: - prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate)[0] + prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] if not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) @@ -431,14 +292,16 @@ def generate_chat_prompt(user_input, state, **kwargs): outer_messages.append({"role": "assistant", "content": prefix}) prompt = instruct_renderer(messages=outer_messages) - suffix = handler.get_generation_prefix_suffix(instruct_renderer, impersonate=False)[1] + suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] if len(suffix) > 0: prompt = prompt[:-len(suffix)] else: if _continue: - prompt = handler.modify_for_continue(prompt, renderer, impersonate) + suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] + if len(suffix) > 0: + prompt = prompt[:-len(suffix)] else: - prefix = handler.get_generation_prefix_suffix(renderer, impersonate=impersonate)[0] + prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] if state['mode'] == 'chat' and not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) @@ -564,16 +427,31 @@ def get_stopping_strings(state): if state['mode'] in ['instruct', 'chat-instruct']: template = jinja_env.from_string(state['instruction_template_str']) renderer = partial(template.render, add_generation_prompt=False) - renderers.append((renderer, state['instruction_template_str'])) + renderers.append(renderer) if state['mode'] in ['chat', 'chat-instruct']: template = jinja_env.from_string(state['chat_template_str']) renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2']) - renderers.append((renderer, state['chat_template_str'])) + renderers.append(renderer) - for renderer, template_str in renderers: - handler = create_template_handler(template_str) - stopping_strings += handler.get_stopping_strings(renderer) + for renderer in renderers: + prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False) + prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True) + + stopping_strings += [ + suffix_user + prefix_bot, + suffix_user + prefix_user, + suffix_bot + prefix_bot, + suffix_bot + prefix_user, + ] + + # Try to find the EOT token + for item in stopping_strings.copy(): + item = 
item.strip() + if item.startswith("<") and ">" in item: + stopping_strings.append(item.split(">")[0] + ">") + elif item.startswith("[") and "]" in item: + stopping_strings.append(item.split("]")[0] + "]") if 'stopping_strings' in state and isinstance(state['stopping_strings'], list): stopping_strings += state.pop('stopping_strings') @@ -772,16 +650,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess output = apply_extensions('history', output) state = apply_extensions('state', state) - # Automatically set skip_special_tokens to False for channel-based templates - if state['mode'] in ['instruct', 'chat-instruct']: - template_str = state['instruction_template_str'] - else: # chat mode - template_str = state['chat_template_str'] - - handler = create_template_handler(template_str) - if isinstance(handler, ChannelTemplateHandler): - state['skip_special_tokens'] = False - # Let the jinja2 template handle the BOS token if state['mode'] in ['instruct', 'chat-instruct']: state['add_bos_token'] = False @@ -941,10 +809,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess def impersonate_wrapper(textbox, state): - # Check template support first - template_str = state['chat_template_str'] - handler = create_template_handler(template_str) - text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) From bfbbfc2361e26b03e5af9a26434391be9fd257f1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:33:01 -0700 Subject: [PATCH 21/27] Ignore add_generation_prompt in GPT-OSS --- modules/models_settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/models_settings.py b/modules/models_settings.py index 3c068df0..e35e1c04 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -93,6 +93,7 @@ def get_model_metadata(model): template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'raise_exception\([^)]*\)', "''", template) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) + template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template @@ -153,6 +154,7 @@ def get_model_metadata(model): template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'raise_exception\([^)]*\)', "''", template) template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) + template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template From fbea21a1f13186740012c55aa7877a5aeda89c2f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:33:27 -0700 Subject: [PATCH 22/27] Only use enable_thinking if the template supports it --- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 82760cc8..e7fd86f9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -307,7 +307,7 @@ def generate_chat_prompt(user_input, state, **kwargs): 
prompt += prefix - if state['mode'] == 'instruct' and not any((_continue, impersonate, state['enable_thinking'])): + if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])): prompt += get_thinking_suppression_string(instruction_template) return prompt From 7c82d65a9d071342cc501246760f8a875e5097a7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:05:09 -0700 Subject: [PATCH 23/27] Handle GPT-OSS as a special template case --- modules/chat.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index e7fd86f9..66f89c70 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -302,6 +302,13 @@ def generate_chat_prompt(user_input, state, **kwargs): prompt = prompt[:-len(suffix)] else: prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + + # Handle GPT-OSS as a special case + if '<|channel|>final<|message|>' in state['instruction_template_str']: + prefix = prefix.rstrip("<|channel|>final<|message|>") + if impersonate: + prefix += "<|message|>" + if state['mode'] == 'chat' and not impersonate: prefix = apply_extensions('bot_prefix', prefix, state) @@ -460,6 +467,12 @@ def get_stopping_strings(state): result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)] result = list(set(result)) + # Handle GPT-OSS as a special case + if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result: + result.remove("<|end|>") + result.append("<|result|>") + result = list(set(result)) + if shared.args.verbose: logger.info("STOPPING_STRINGS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result) @@ -650,6 +663,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess output = apply_extensions('history', output) state = apply_extensions('state', state) + # Handle GPT-OSS as a special case + if '<|channel|>final<|message|>' in state['instruction_template_str']: + state['skip_special_tokens'] = False + # Let the jinja2 template handle the BOS token if state['mode'] in ['instruct', 'chat-instruct']: state['add_bos_token'] = False From 6ce4b353c49a1b9b86cf842a1d30ec2198f5d9b7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Aug 2025 06:42:45 -0700 Subject: [PATCH 24/27] Fix the GPT-OSS template --- modules/chat.py | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 66f89c70..e07dfd1c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -211,7 +211,39 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) if assistant_msg: - messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) + # Handle GPT-OSS as a special case + if '<|channel|>analysis<|message|>' in assistant_msg or '<|channel|>final<|message|>' in assistant_msg: + + thinking_content = "" + final_content = "" + + # Extract analysis content if present + if '<|channel|>analysis<|message|>' in assistant_msg: + analysis_start = assistant_msg.find('<|channel|>analysis<|message|>') + len('<|channel|>analysis<|message|>') + if '<|start|>assistant<|channel|>final<|message|>' in assistant_msg: + analysis_end = assistant_msg.find('<|start|>assistant<|channel|>final<|message|>') 
+ else: + analysis_end = len(assistant_msg) + + thinking_content = assistant_msg[analysis_start:analysis_end].strip() + + # Extract final content if present + if '<|start|>assistant<|channel|>final<|message|>' in assistant_msg: + final_start = assistant_msg.find('<|start|>assistant<|channel|>final<|message|>') + len('<|start|>assistant<|channel|>final<|message|>') + final_content = assistant_msg[final_start:].strip() + elif '<|channel|>final<|message|>' in assistant_msg: + final_start = assistant_msg.find('<|channel|>final<|message|>') + len('<|channel|>final<|message|>') + final_content = assistant_msg[final_start:].strip() + + # Insert as structured message + msg_dict = {"role": "assistant", "content": final_content} + if thinking_content: + msg_dict["thinking"] = thinking_content + + messages.insert(insert_pos, msg_dict) + + else: + messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: # Check for user message attachments in metadata @@ -305,7 +337,9 @@ def generate_chat_prompt(user_input, state, **kwargs): # Handle GPT-OSS as a special case if '<|channel|>final<|message|>' in state['instruction_template_str']: - prefix = prefix.rstrip("<|channel|>final<|message|>") + if prefix.endswith("<|channel|>final<|message|>"): + prefix = prefix[:-len("<|channel|>final<|message|>")] + if impersonate: prefix += "<|message|>" From 0c1403f2c72133e1ff63154d21f37954a2e1c343 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Aug 2025 08:05:37 -0700 Subject: [PATCH 25/27] Handle GPT-OSS as a special case when continuing --- modules/chat.py | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index e07dfd1c..64588b9d 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -237,7 +237,7 @@ def generate_chat_prompt(user_input, state, **kwargs): # Insert as structured message msg_dict = {"role": "assistant", "content": final_content} - if thinking_content: + if '<|channel|>analysis<|message|>' in assistant_msg: msg_dict["thinking"] = thinking_content messages.insert(insert_pos, msg_dict) @@ -328,25 +328,42 @@ def generate_chat_prompt(user_input, state, **kwargs): if len(suffix) > 0: prompt = prompt[:-len(suffix)] else: - if _continue: - suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] - if len(suffix) > 0: - prompt = prompt[:-len(suffix)] + # Handle GPT-OSS as a special case when continuing + if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']: + # This prevents the template from stripping the analysis block of the message being continued. 
+ + last_message_to_continue = messages[-1] + prompt = renderer(messages=messages[:-1]) + + assistant_reply_so_far = "" + if 'thinking' in last_message_to_continue: + assistant_reply_so_far += f"<|start|>assistant<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" + + assistant_reply_so_far += f"<|start|>assistant<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" + + prompt += assistant_reply_so_far + else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + prompt = renderer(messages=messages) + if _continue: + suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] + if len(suffix) > 0: + prompt = prompt[:-len(suffix)] + else: + prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] - # Handle GPT-OSS as a special case - if '<|channel|>final<|message|>' in state['instruction_template_str']: - if prefix.endswith("<|channel|>final<|message|>"): - prefix = prefix[:-len("<|channel|>final<|message|>")] + # Handle GPT-OSS as a special case when not continuing + if '<|channel|>final<|message|>' in state['instruction_template_str']: + if prefix.endswith("<|channel|>final<|message|>"): + prefix = prefix[:-len("<|channel|>final<|message|>")] - if impersonate: - prefix += "<|message|>" + if impersonate: + prefix += "<|message|>" - if state['mode'] == 'chat' and not impersonate: - prefix = apply_extensions('bot_prefix', prefix, state) + if state['mode'] == 'chat' and not impersonate: + prefix = apply_extensions('bot_prefix', prefix, state) - prompt += prefix + prompt += prefix if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])): prompt += get_thinking_suppression_string(instruction_template) From 3e24f455c8cca90310d5a3f9db31ed2007520db3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Aug 2025 10:18:42 -0700 Subject: [PATCH 26/27] Fix continue for GPT-OSS (hopefully the final fix) --- modules/chat.py | 54 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 64588b9d..1ab91b5e 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -219,21 +219,39 @@ def generate_chat_prompt(user_input, state, **kwargs): # Extract analysis content if present if '<|channel|>analysis<|message|>' in assistant_msg: - analysis_start = assistant_msg.find('<|channel|>analysis<|message|>') + len('<|channel|>analysis<|message|>') - if '<|start|>assistant<|channel|>final<|message|>' in assistant_msg: - analysis_end = assistant_msg.find('<|start|>assistant<|channel|>final<|message|>') - else: - analysis_end = len(assistant_msg) + # Split the message by the analysis tag to isolate the content that follows + parts = assistant_msg.split('<|channel|>analysis<|message|>', 1) + if len(parts) > 1: + # The content is everything after the tag + potential_content = parts[1] - thinking_content = assistant_msg[analysis_start:analysis_end].strip() + # Now, find the end of this content block + analysis_end_tag = '<|end|>' + if analysis_end_tag in potential_content: + thinking_content = potential_content.split(analysis_end_tag, 1)[0].strip() + else: + # Fallback: if no <|end|> tag, stop at the start of the final channel if it exists + final_channel_tag = '<|channel|>final<|message|>' + if final_channel_tag in potential_content: + thinking_content = potential_content.split(final_channel_tag, 
1)[0].strip() + else: + thinking_content = potential_content.strip() # Extract final content if present - if '<|start|>assistant<|channel|>final<|message|>' in assistant_msg: - final_start = assistant_msg.find('<|start|>assistant<|channel|>final<|message|>') + len('<|start|>assistant<|channel|>final<|message|>') - final_content = assistant_msg[final_start:].strip() - elif '<|channel|>final<|message|>' in assistant_msg: - final_start = assistant_msg.find('<|channel|>final<|message|>') + len('<|channel|>final<|message|>') - final_content = assistant_msg[final_start:].strip() + final_tag_to_find = '<|channel|>final<|message|>' + if final_tag_to_find in assistant_msg: + # Split the message by the final tag to isolate the content that follows + parts = assistant_msg.split(final_tag_to_find, 1) + if len(parts) > 1: + # The content is everything after the tag + potential_content = parts[1] + + # Now, find the end of this content block + final_end_tag = '<|end|>' + if final_end_tag in potential_content: + final_content = potential_content.split(final_end_tag, 1)[0].strip() + else: + final_content = potential_content.strip() # Insert as structured message msg_dict = {"role": "assistant", "content": final_content} @@ -330,16 +348,16 @@ def generate_chat_prompt(user_input, state, **kwargs): else: # Handle GPT-OSS as a special case when continuing if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']: - # This prevents the template from stripping the analysis block of the message being continued. - last_message_to_continue = messages[-1] prompt = renderer(messages=messages[:-1]) - assistant_reply_so_far = "" - if 'thinking' in last_message_to_continue: - assistant_reply_so_far += f"<|start|>assistant<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" + # Start the assistant turn wrapper + assistant_reply_so_far = "<|start|>assistant" - assistant_reply_so_far += f"<|start|>assistant<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" + if 'thinking' in last_message_to_continue: + assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" + + assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" prompt += assistant_reply_so_far From f1147c992618ee17a7f5a37331d99d00ad02fd79 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Aug 2025 19:32:36 -0700 Subject: [PATCH 27/27] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_cuda128.txt | 4 ++-- requirements/full/requirements_cuda128_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files 
changed, 38 insertions(+), 38 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index dd1e8d35..f17cae8a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index b65f0b09..51f4571f 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 6e698654..37021c77 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 84abd394..f54ae191 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 2deefbc4..e495455b 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == 
"3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 8c1baf04..72847534 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 67a44432..ed641a24 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 9fe3c54b..d7fe735b 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 
tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index 50602d8d..cb71f74b 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index abdcfc16..d6bed576 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 30d7d9e4..1f17dc50 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index a7c7808a..82254842 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; 
platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index b1f66f56..986a3d49 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 76530338..833e923b 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 26235b83..6a894d49 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cfa76310..0afb19c2 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 2f8c401d..a404f50c 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index e0650575..75176656 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.32.0/llama_cpp_binaries-0.32.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
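A note on the GPT-OSS ("harmony") parsing: it is spread across patches 13, 15, 24 and 26 above, which makes the final control flow hard to follow in diff form. The sketch below condenses the dual-format thinking-block extraction that those patches converge on, so the logic can be exercised in isolation. The tag strings are copied from the diffs; the function name split_thinking, its docstring, and the sample string are illustrative assumptions and not code from the repository.

# Standalone sketch (not part of any patch above) of the dual-format
# thinking-block extraction built up in modules/html_generator.py.
THINK_START = "<think>"
THINK_END = "</think>"
ALT_START = "<|channel|>analysis<|message|>"
ALT_END = "<|end|>"
ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>"


def split_thinking(string):
    """Return (thinking, remainder); thinking is None when no block is found."""
    # Classic <think>...</think> format first
    start = string.find(THINK_START)
    end = string.find(THINK_END)
    if start != -1 or end != -1:
        thought_start = 0 if start == -1 else start + len(THINK_START)
        if end == -1:
            return string[thought_start:], ""
        return string[thought_start:end], string[end + len(THINK_END):]

    # GPT-OSS channel format: the analysis channel is the "thinking",
    # the final channel is the visible reply
    a_start = string.find(ALT_START)
    a_end = string.find(ALT_END)
    a_content = string.find(ALT_CONTENT_START)
    if a_start == -1 and a_end == -1:
        return None, string

    thought_start = 0 if a_start == -1 else a_start + len(ALT_START)
    if a_end == -1:
        # No explicit <|end|>: fall back to the final-channel marker if present
        thought_end = a_content if a_content != -1 else len(string)
        content_start = a_content + len(ALT_CONTENT_START) if a_content != -1 else len(string)
    else:
        thought_end = a_end
        content_start = a_content + len(ALT_CONTENT_START) if a_content != -1 else a_end + len(ALT_END)

    return string[thought_start:thought_end], string[content_start:]


if __name__ == "__main__":
    sample = ("<|channel|>analysis<|message|>Let me reason about this.<|end|>"
              "<|start|>assistant<|channel|>final<|message|>Here is the answer.")
    print(split_thinking(sample))  # ('Let me reason about this.', 'Here is the answer.')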