From 7b887247115b40825854c31f094703af399bd6d2 Mon Sep 17 00:00:00 2001
From: oobabooga
Date: Wed, 1 Jan 2025 18:33:38 -0300
Subject: [PATCH 01/35] Make responses start faster by removing unnecessary cleanup calls (#6625)

---
 modules/callbacks.py       | 15 +--------------
 modules/models.py          |  1 +
 modules/text_generation.py |  5 +++--
 3 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/modules/callbacks.py b/modules/callbacks.py
index 2b039ef1..2c04cc53 100644
--- a/modules/callbacks.py
+++ b/modules/callbacks.py
@@ -65,7 +65,6 @@ class Iteratorize:
                 traceback.print_exc()
                 pass
 
-            clear_torch_cache()
             self.q.put(self.sentinel)
             if self.c_callback:
                 self.c_callback(ret)
@@ -84,22 +83,10 @@ class Iteratorize:
             return obj
 
     def __del__(self):
-        clear_torch_cache()
+        pass
 
     def __enter__(self):
        return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.stop_now = True
-        clear_torch_cache()
-
-
-def clear_torch_cache():
-    gc.collect()
-    if not shared.args.cpu:
-        if is_torch_xpu_available():
-            torch.xpu.empty_cache()
-        elif is_torch_npu_available():
-            torch.npu.empty_cache()
-        else:
-            torch.cuda.empty_cache()
diff --git a/modules/models.py b/modules/models.py
index 3e8ae3f2..7a52c07c 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -90,6 +90,7 @@ def load_model(model_name, loader=None):
                 raise ValueError
 
     shared.args.loader = loader
+    clear_torch_cache()
     output = load_func_map[loader](model_name)
     if type(output) is tuple:
         model, tokenizer = output
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 86245098..c999fa81 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -79,7 +79,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
             all_stop_strings += st
 
     shared.stop_everything = False
-    clear_torch_cache()
     seed = set_manual_seed(state['seed'])
     last_update = -1
     reply = ''
@@ -288,6 +287,9 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
 
 
 def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
+    if shared.args.loader == 'Transformers':
+        clear_torch_cache()
+
     generate_params = {}
     for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers', 'xtc_threshold', 'xtc_probability']:
         if k in state:
@@ -393,7 +395,6 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
 
         def generate_with_callback(callback=None, *args, **kwargs):
             kwargs['stopping_criteria'].append(Stream(callback_func=callback))
-            clear_torch_cache()
             with torch.no_grad():
                 shared.model.generate(**kwargs)
 

From 725639118a84fc78a1bdb9c49538a941899eee58 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Jan 2025 13:53:50 -0800
Subject: [PATCH 02/35] UI: Use a tab length of 2 for lists (rather than 4)

---
 modules/html_generator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 40a56731..8160f8b6 100644
--- a/modules/html_generator.py
+++ 
b/modules/html_generator.py
@@ -195,7 +195,7 @@ def convert_to_markdown(string):
         result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result)
 
         # Convert to HTML using markdown
-        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
+        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'], tab_length=2)
 
         # Remove the delete string from the HTML output
         pos = html_output.rfind(delete_str)
@@ -203,7 +203,7 @@ if pos > -1:
             html_output = html_output[:pos] + html_output[pos + len(delete_str):]
     else:
         # Convert to HTML using markdown
-        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
+        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'], tab_length=2)
 
     # Unescape code blocks
     pattern = re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL)

From 9163951f3aa65956ffcc7518d4a9c6ccca353b0b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Jan 2025 17:49:57 -0800
Subject: [PATCH 03/35] UI: reduce the CPU usage during text streaming

---
 js/main.js | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/js/main.js b/js/main.js
index a8018175..76a0184b 100644
--- a/js/main.js
+++ b/js/main.js
@@ -446,25 +446,33 @@ function toggleBigPicture() {
 //------------------------------------------------
 // Handle the chat input box growth
 //------------------------------------------------
-let currentChatInputHeight = 0;
+
+// Variables to store current dimensions
+let currentChatInputHeight = chatInput.clientHeight;
+
+// Cache DOM elements
+const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
+const chatInput = document.querySelector("#chat-input textarea");
 
 // Update chat layout based on chat and input dimensions
 function updateCssProperties() {
-  const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
-  const chatInputHeight = document.querySelector("#chat-input textarea").clientHeight;
+  const chatInputHeight = chatInput.clientHeight;
 
   // Check if the chat container is visible
   if (chatContainer.clientHeight > 0) {
-    const newChatHeight = `${chatContainer.parentNode.clientHeight - chatInputHeight + 40 - 100 - 20}px`;
+    const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
+    const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;
+
     document.documentElement.style.setProperty("--chat-height", newChatHeight);
     document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);
 
     // Adjust scrollTop based on input height change
     if (chatInputHeight !== currentChatInputHeight) {
-      if (!isScrolled && chatInputHeight < currentChatInputHeight) {
+      const deltaHeight = chatInputHeight - currentChatInputHeight;
+      if (!isScrolled && deltaHeight < 0) {
         chatContainer.scrollTop = chatContainer.scrollHeight;
       } else {
-        chatContainer.scrollTop += chatInputHeight - currentChatInputHeight;
+        chatContainer.scrollTop += deltaHeight;
       }
 
       currentChatInputHeight = chatInputHeight;

From f011787a835b8eb2d3cf354cec4b76a8109defbc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 1 Jan 2025 17:55:18 -0800
Subject: [PATCH 04/35] UI: make codeblocks scroll horizontally on overflow

---
 css/main.css | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/css/main.css b/css/main.css
index 314b36e0..a08350ad 100644
--- a/css/main.css
+++ b/css/main.css
@@ -538,8 +538,8 @@ 
div.svelte-362y77>*, div.svelte-362y77>.form>* { } .message-body pre > code { - white-space: pre-wrap !important; - word-wrap: break-word !important; + white-space: pre !important; + overflow-x: auto !important; border: 1px solid #666; border-radius: 5px; font-size: 82%; From 979e1f1bd68ed9e44b9af0633017f9ecc456bbcd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Jan 2025 17:57:09 -0800 Subject: [PATCH 05/35] Fix a bug after 9163951f3aa65956ffcc7518d4a9c6ccca353b0b --- js/main.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/main.js b/js/main.js index 76a0184b..efb84238 100644 --- a/js/main.js +++ b/js/main.js @@ -447,13 +447,13 @@ function toggleBigPicture() { // Handle the chat input box growth //------------------------------------------------ -// Variables to store current dimensions -let currentChatInputHeight = chatInput.clientHeight; - // Cache DOM elements const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode; const chatInput = document.querySelector("#chat-input textarea"); +// Variables to store current dimensions +let currentChatInputHeight = chatInput.clientHeight; + // Update chat layout based on chat and input dimensions function updateCssProperties() { const chatInputHeight = chatInput.clientHeight; From 13c033c745b54ce3b35c805ca0ff12f8f93bfb5d Mon Sep 17 00:00:00 2001 From: Petr Korolev Date: Thu, 2 Jan 2025 06:06:11 +0300 Subject: [PATCH 06/35] Fix CUDA error on MPS backend during API request (#6572) --------- Co-authored-by: oobabooga --- modules/LoRA.py | 14 +++--------- modules/logits.py | 25 +++++++++------------ modules/models.py | 46 ++++++++++++++++++++++++-------------- modules/sampler_hijack.py | 15 +++++++------ modules/text_generation.py | 28 ++++++++++------------- 5 files changed, 63 insertions(+), 65 deletions(-) diff --git a/modules/LoRA.py b/modules/LoRA.py index 4fd144ba..e1ad01d7 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -1,11 +1,8 @@ from pathlib import Path -import torch -from transformers import is_torch_xpu_available - import modules.shared as shared from modules.logging_colors import logger -from modules.models import reload_model +from modules.models import get_device, reload_model def add_lora_to_model(lora_names): @@ -132,14 +129,9 @@ def add_lora_transformers(lora_names): if not shared.args.load_in_8bit and not shared.args.cpu: shared.model.half() if not hasattr(shared.model, "hf_device_map"): - if torch.backends.mps.is_available(): - device = torch.device('mps') + device = get_device() + if device: shared.model = shared.model.to(device) - elif is_torch_xpu_available(): - device = torch.device("xpu:0") - shared.model = shared.model.to(device) - else: - shared.model = shared.model.cuda() shared.lora_names = lora_names diff --git a/modules/logits.py b/modules/logits.py index 73cabb41..f8a1e80c 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -2,11 +2,10 @@ import time import traceback import torch -from transformers import is_torch_npu_available, is_torch_xpu_available from modules import models, sampler_hijack, shared from modules.logging_colors import logger -from modules.models import load_model +from modules.models import get_device, load_model from modules.text_generation import generate_reply global_scores = None @@ -57,23 +56,21 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur scores = sampler_hijack.global_scores[-1] else: if is_non_hf_exllamav2: - if 
is_torch_xpu_available(): - tokens = shared.tokenizer.encode(prompt).to("xpu:0") - elif is_torch_npu_available(): - tokens = shared.tokenizer.encode(prompt).to("npu:0") - else: - tokens = shared.tokenizer.encode(prompt).cuda() + device = get_device() + tokens = shared.tokenizer.encode(prompt) + if device: + tokens = tokens.to(device) + scores = shared.model.get_logits(tokens)[-1][-1] elif is_non_hf_llamacpp: tokens = shared.tokenizer.encode(prompt) scores = shared.model.get_logits(tokens)[-1][-1] else: - if is_torch_xpu_available(): - tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0") - elif is_torch_npu_available(): - tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("npu:0") - else: - tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda() + device = get_device() + tokens = shared.tokenizer.encode(prompt, return_tensors='pt') + if device: + tokens = tokens.to(device) + output = shared.model(input_ids=tokens) scores = output['logits'][-1][-1] diff --git a/modules/models.py b/modules/models.py index 7a52c07c..d906535b 100644 --- a/modules/models.py +++ b/modules/models.py @@ -21,11 +21,12 @@ from transformers import ( AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, - GPTQConfig + GPTQConfig, + is_torch_npu_available, + is_torch_xpu_available ) import modules.shared as shared -from modules import sampler_hijack from modules.logging_colors import logger from modules.models_settings import get_model_metadata @@ -56,8 +57,6 @@ if shared.args.deepspeed: ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration -sampler_hijack.hijack_samplers() - last_generation_time = time.time() @@ -172,17 +171,9 @@ def huggingface_loader(model_name): model = LoaderClass.from_pretrained(path_to_model, **params) if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit): - if torch.backends.mps.is_available(): - device = torch.device('mps') + device = get_device() + if device: model = model.to(device) - elif is_xpu_available(): - device = torch.device("xpu") - model = model.to(device) - elif is_npu_available(): - device = torch.device("npu") - model = model.to(device) - else: - model = model.cuda() # DeepSpeed ZeRO-3 elif shared.args.deepspeed: @@ -380,13 +371,34 @@ def get_max_memory_dict(): return max_memory if len(max_memory) > 0 else None +def get_device(): + if torch.cuda.is_available(): + return torch.device('cuda') + elif shared.args.deepspeed: + import deepspeed + return deepspeed.get_accelerator().current_device_name() + elif torch.backends.mps.is_available(): + return torch.device('mps') + elif is_torch_xpu_available(): + return torch.device('xpu:0') + elif is_torch_npu_available(): + return torch.device('npu:0') + else: + return None + + def clear_torch_cache(): gc.collect() if not shared.args.cpu: - if is_xpu_available(): - torch.xpu.empty_cache() - else: + if torch.cuda.is_available(): torch.cuda.empty_cache() + elif is_xpu_available(): + torch.xpu.empty_cache() + elif is_npu_available(): + torch.npu.empty_cache() + elif torch.backends.mps.is_available(): + if hasattr(torch.backends.mps, 'empty_cache'): + torch.backends.mps.empty_cache() def unload_model(keep_model_name=False): diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 24dbcf2e..62ceca8d 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -5,7 +5,7 @@ import random import torch import 
transformers -from transformers import LogitsWarper, is_torch_xpu_available +from transformers import LogitsWarper from transformers.generation.logits_process import ( LogitNormalization, LogitsProcessor, @@ -14,6 +14,7 @@ from transformers.generation.logits_process import ( from modules import shared from modules.logging_colors import logger +from modules.models import get_device global_scores = None @@ -339,12 +340,12 @@ class MirostatLogitsWarper(LogitsWarper): break # Normalize the probabilities of the remaining words - if is_torch_xpu_available(): - prob_topk = torch.softmax(sorted_logits, dim=0).to("xpu") - prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to("xpu") - else: - prob_topk = torch.softmax(sorted_logits, dim=0).to('cuda') - prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda') + prob_topk = torch.softmax(sorted_logits, dim=0) + prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True) + device = get_device() + if device: + prob_topk = prob_topk.to(device) + prev_i = prev_i.to(device) observed_surprise = -math.log2(prob_topk[prev_i]) self.e = observed_surprise - self.mirostat_tau diff --git a/modules/text_generation.py b/modules/text_generation.py index c999fa81..db415dce 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -16,7 +16,7 @@ from transformers import ( ) import modules.shared as shared -from modules import models +from modules import models, sampler_hijack from modules.cache_utils import process_llamacpp_cache from modules.callbacks import ( Iteratorize, @@ -28,7 +28,9 @@ from modules.grammar.grammar_utils import initialize_grammar from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor from modules.html_generator import generate_basic_html from modules.logging_colors import logger -from modules.models import clear_torch_cache, load_model +from modules.models import clear_torch_cache, get_device, load_model + +sampler_hijack.hijack_samplers() def generate_reply(*args, **kwargs): @@ -159,18 +161,12 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: return input_ids - elif shared.args.deepspeed: - import deepspeed - return input_ids.to(deepspeed.get_accelerator().current_device_name()) - elif torch.backends.mps.is_available(): - device = torch.device('mps') - return input_ids.to(device) - elif is_torch_xpu_available(): - return input_ids.to("xpu:0") - elif is_torch_npu_available(): - return input_ids.to("npu:0") else: - return input_ids.cuda() + device = get_device() + if device: + return input_ids.to(device) + + return input_ids def decode(output_ids, skip_special_tokens=True): @@ -328,7 +324,6 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings # Encode the input input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) output = input_ids[0] - cuda = not any((shared.args.cpu, shared.args.deepspeed)) if state['auto_max_new_tokens']: generate_params['max_new_tokens'] = state['truncation_length'] - input_ids.shape[-1] @@ -383,8 +378,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings if not state['stream']: with torch.no_grad(): output = shared.model.generate(**generate_params)[0] - if cuda: - output = output.cuda() + device = get_device() + if device: + output = output.to(device) starting_from 
= 0 if shared.is_seq2seq else len(input_ids[0]) yield get_reply_from_output_ids(output, state, starting_from=starting_from) From 75f1b5ccde221783b4fedef92da6afc3a44824f0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Jan 2025 16:24:18 -0800 Subject: [PATCH 07/35] UI: add a "Branch chat" button --- css/main.css | 1 + modules/chat.py | 15 +++++++++++++++ modules/ui_chat.py | 5 +++++ 3 files changed, 21 insertions(+) diff --git a/css/main.css b/css/main.css index a08350ad..870f4192 100644 --- a/css/main.css +++ b/css/main.css @@ -980,6 +980,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #rename-row { width: 100%; justify-content: center; + gap: 10px; } diff --git a/modules/chat.py b/modules/chat.py index 2638c794..81328385 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -1092,6 +1092,21 @@ def handle_delete_chat_confirm_click(state): ] +def handle_branch_chat_click(state): + history = state['history'] + new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') + save_history(history, new_unique_id, state['character_menu'], state['mode']) + + histories = find_all_histories_with_first_prompts(state) + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + + convert_to_markdown.cache_clear() + + past_chats_update = gr.update(choices=histories, value=new_unique_id) + + return [history, html, past_chats_update] + + def handle_rename_chat_click(): return [ gr.update(value="My New Chat"), diff --git a/modules/ui_chat.py b/modules/ui_chat.py index e372f5c2..b09d0081 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -24,6 +24,7 @@ def create_ui(): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(elem_id='past-chats-buttons'): + shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu) shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) @@ -250,6 +251,10 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_delete_chat_confirm_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'delete-chat-row'), show_progress=False) + shared.gradio['branch_chat'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) + shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False) shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False) shared.gradio['rename_to-confirm'].click( From 973255cb0bc61c0cf50e6f562bfc632a3d6fab1c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Jan 2025 16:48:24 -0800 Subject: [PATCH 08/35] UI: fix codeblocks overflowing on mobile --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index 870f4192..564a39a1 100644 --- a/css/main.css +++ b/css/main.css @@ -540,6 +540,7 @@ div.svelte-362y77>*, 
div.svelte-362y77>.form>* { .message-body pre > code { white-space: pre !important; overflow-x: auto !important; + max-width: calc(100dvw - 39px); border: 1px solid #666; border-radius: 5px; font-size: 82%; From b8fc9010fa4667c78a01adb6f1449e515e94709f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Jan 2025 16:57:04 -0800 Subject: [PATCH 09/35] UI: fix `orjson.JSONDecodeError` error on page reload --- modules/ui_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index b09d0081..ecebc28e 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -18,7 +18,7 @@ def create_ui(): mu = shared.args.multi_user shared.gradio['Chat input'] = gr.State() - shared.gradio['history'] = gr.JSON({'internal': [], 'visible': []}, visible=False) + shared.gradio['history'] = gr.JSON(visible=False) with gr.Tab('Chat', elem_id='chat-tab'): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): From 4b3e1b37573057fe878ff48037569253b9df5576 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Jan 2025 18:46:40 -0800 Subject: [PATCH 10/35] UI: add a "Search chats" input field --- css/main.css | 6 +++++- modules/chat.py | 40 +++++++++++++++++++++++++--------------- modules/ui.py | 1 + modules/ui_chat.py | 6 ++++++ 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/css/main.css b/css/main.css index 564a39a1..64da1602 100644 --- a/css/main.css +++ b/css/main.css @@ -876,6 +876,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { flex-shrink: 1; } +#search_chat > :nth-child(2) > :first-child { + display: none; +} + /* ---------------------------------------------- Keep dropdown menus above errored components ---------------------------------------------- */ @@ -911,7 +915,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #past-chats { - max-height: calc(100dvh - 90px); + max-height: calc(100dvh - 135px); overflow-y: scroll !important; border-radius: 0; scrollbar-width: auto; diff --git a/modules/chat.py b/modules/chat.py index 81328385..694c137b 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -593,21 +593,26 @@ def find_all_histories_with_first_prompts(state): result = [] for i, path in enumerate(histories): filename = path.stem - if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename): - with open(path, 'r', encoding='utf-8') as f: - data = json.load(f) + file_content = "" + with open(path, 'r', encoding='utf-8') as f: + file_content = f.read() - first_prompt = "" - if data and 'visible' in data and len(data['visible']) > 0: - if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>': - if len(data['visible']) > 1: - first_prompt = html.unescape(data['visible'][1][0]) - elif i == 0: - first_prompt = "New chat" - else: - first_prompt = html.unescape(data['visible'][0][0]) - elif i == 0: - first_prompt = "New chat" + if state['search_chat'] and state['search_chat'] not in file_content: + continue + + data = json.loads(file_content) + if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename): + first_prompt = "" + if data and 'visible' in data and len(data['visible']) > 0: + if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>': + if len(data['visible']) > 1: + first_prompt = html.unescape(data['visible'][1][0]) + elif i == 0: + first_prompt = "New chat" + else: + first_prompt = html.unescape(data['visible'][0][0]) + elif i == 0: + first_prompt = "New chat" else: first_prompt = filename @@ -615,7 
+620,7 @@ def find_all_histories_with_first_prompts(state): # Truncate the first prompt if it's longer than 30 characters if len(first_prompt) > 30: - first_prompt = first_prompt[:30-3] + '...' + first_prompt = first_prompt[:30 - 3] + '...' result.append((first_prompt, filename)) @@ -1124,6 +1129,11 @@ def handle_rename_chat_confirm(rename_to, state): ] +def handle_search_chat_change(state): + histories = find_all_histories_with_first_prompts(state) + return gr.update(choices=histories) + + def handle_upload_chat_history(load_chat_history, state): history = start_new_chat(state) history = load_history_json(load_chat_history, history) diff --git a/modules/ui.py b/modules/ui.py index 4bfea9fa..a3bf520f 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -230,6 +230,7 @@ def list_interface_input_elements(): 'start_with', 'character_menu', 'history', + 'search_chat', 'unique_id', 'name1', 'user_bio', diff --git a/modules/ui_chat.py b/modules/ui_chat.py index ecebc28e..b92dd9ae 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -29,6 +29,8 @@ def create_ui(): shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) + shared.gradio['search_chat'] = gr.Textbox(placeholder='Search chats...', max_lines=1, elem_id='search_chat') + with gr.Row(elem_id='delete-chat-row', visible=False) as shared.gradio['delete-chat-row']: shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', elem_classes=['refresh-button', 'focus-on-chat-input']) shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', elem_classes=['refresh-button', 'focus-on-chat-input']) @@ -265,6 +267,10 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename-row'), show_progress=False) + shared.gradio['search_chat'].change( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_search_chat_change, gradio('interface_state'), gradio('unique_id'), show_progress=False) + shared.gradio['load_chat_history'].upload( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_upload_chat_history, gradio('load_chat_history', 'interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then( From e2702200e131944e58e19a9894b8ebadb24c6f75 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 2 Jan 2025 19:26:50 -0800 Subject: [PATCH 11/35] UI: fix the font size of lists in chat mode --- css/chat_style-cai-chat.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index d7b1ba88..93276bd3 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -9,6 +9,7 @@ .message-body { margin-top: 3px; + font-size: 15px !important; } .circle-you { From 3815f46838e4bb2542d32fbdc5151bc4f2a5abb7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 3 Jan 2025 04:35:29 -0800 Subject: [PATCH 12/35] UI: minor style improvements to chat tab --- css/main.css | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 64da1602..0ccaf106 100644 --- a/css/main.css +++ b/css/main.css @@ -839,7 +839,11 @@ 
div.svelte-362y77>*, div.svelte-362y77>.form>* { Past chats menus ---------------------------------------------- */ #rename-row label { - margin-top: var(--layout-gap); + margin-top: 0; +} + +#rename-row > :nth-child(2) { + justify-content: center; } /* ---------------------------------------------- From 9f24885bd2f70f7d07742f766107983b301434af Mon Sep 17 00:00:00 2001 From: mamei16 Date: Sat, 4 Jan 2025 19:41:31 +0100 Subject: [PATCH 13/35] Sane handling of markdown lists (#6626) --- modules/html_generator.py | 7 +- modules/sane_markdown_lists.py | 336 +++++++++++++++++++++++++++++++++ 2 files changed, 340 insertions(+), 3 deletions(-) create mode 100644 modules/sane_markdown_lists.py diff --git a/modules/html_generator.py b/modules/html_generator.py index 8160f8b6..e61fc558 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -9,6 +9,7 @@ import markdown from PIL import Image, ImageOps from modules import shared +from modules.sane_markdown_lists import SaneListExtension from modules.utils import get_available_chat_styles # This is to store the paths to the thumbnails of the profile pictures @@ -174,7 +175,7 @@ def convert_to_markdown(string): result += '\n' # Also don't add an extra \n for lists elif stripped_line.startswith('-') or stripped_line.startswith('*') or stripped_line.startswith('+') or stripped_line.startswith('>') or re.match(r'\d+\.', stripped_line): - result += '\n' + result += ' \n' else: result += ' \n' @@ -195,7 +196,7 @@ def convert_to_markdown(string): result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result) # Convert to HTML using markdown - html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'], tab_length=2) + html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()]) # Remove the delete string from the HTML output pos = html_output.rfind(delete_str) @@ -203,7 +204,7 @@ def convert_to_markdown(string): html_output = html_output[:pos] + html_output[pos + len(delete_str):] else: # Convert to HTML using markdown - html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'], tab_length=2) + html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()]) # Unescape code blocks pattern = re.compile(r']*>(.*?)', re.DOTALL) diff --git a/modules/sane_markdown_lists.py b/modules/sane_markdown_lists.py new file mode 100644 index 00000000..1e1d76fd --- /dev/null +++ b/modules/sane_markdown_lists.py @@ -0,0 +1,336 @@ +# Code based on the Sane List Extension for Python-Markdown +# ======================================= + +# Modify the behavior of Lists in Python-Markdown to act in a sane manner. + +# See https://Python-Markdown.github.io/extensions/sane_lists +# for documentation. + +# Original code Copyright 2011 [Waylan Limberg](http://achinghead.com) + +# All changes Copyright 2011-2014 The Python Markdown Project + +# License: [BSD](https://opensource.org/licenses/bsd-license.php) + +""" +Modify the behavior of Lists in Python-Markdown to act in a sane manner. +""" + +from __future__ import annotations + +import re +import xml.etree.ElementTree as etree +from typing import TYPE_CHECKING + +from markdown import Extension +from markdown.blockparser import BlockParser +from markdown.blockprocessors import ( + ListIndentProcessor, + OListProcessor, + ParagraphProcessor +) + +if TYPE_CHECKING: # pragma: no cover + from markdown import blockparser + + +# The min. 
number of added leading spaces needed to start a nested list +MIN_NESTED_LIST_INDENT = 2 +assert MIN_NESTED_LIST_INDENT > 1, "'MIN_NESTED_LIST_INDENT' must be > 1" + + +class SaneListIndentProcessor(ListIndentProcessor): + """ Process children of list items. + + Example + + * a list item + process this part + + or this part + + """ + + def __init__(self, *args): + super().__init__(*args) + self.INDENT_RE = re.compile(r'^(([ ])+)') + + def test(self, parent: etree.Element, block: str) -> bool: + return block.startswith(' ' * MIN_NESTED_LIST_INDENT) and \ + not self.parser.state.isstate('detabbed') and \ + (parent.tag in self.ITEM_TYPES or + (len(parent) and parent[-1] is not None and + (parent[-1].tag in self.LIST_TYPES))) + + def get_level(self, parent: etree.Element, block: str) -> tuple[int, etree.Element]: + """ Get level of indentation based on list level. """ + # Get indent level + m = self.INDENT_RE.match(block) + if m: + indent_level = len(m.group(1)) / MIN_NESTED_LIST_INDENT + else: + indent_level = 0 + if self.parser.state.isstate('list'): + # We're in a tight-list - so we already are at correct parent. + level = 1 + else: + # We're in a loose-list - so we need to find parent. + level = 0 + # Step through children of tree to find matching indent level. + while indent_level > level: + child = self.lastChild(parent) + if (child is not None and + (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)): + if child.tag in self.LIST_TYPES: + level += 1 + parent = child + else: + # No more child levels. If we're short of `indent_level`, + # we have a code block. So we stop here. + break + return level, parent + + def detab(self, text: str, length: int | None = None) -> tuple[str, str]: + """ Remove a tab from the front of each line of the given text. """ + if length is None: + length = MIN_NESTED_LIST_INDENT + newtext = [] + lines = text.split('\n') + for line in lines: + if line.startswith(' ' * length): + newtext.append(line[length:]) + elif not line.strip(): + newtext.append('') + else: + break + return '\n'.join(newtext), '\n'.join(lines[len(newtext):]) + + def looseDetab(self, text: str, level: int = 1) -> str: + """ Remove indentation from front of lines but allowing dedented lines. """ + lines = text.split('\n') + for i in range(len(lines)): + if lines[i].startswith(' ' * MIN_NESTED_LIST_INDENT * level): + lines[i] = lines[i][MIN_NESTED_LIST_INDENT * level:] + return '\n'.join(lines) + + +class SaneOListProcessor(OListProcessor): + """ Override `SIBLING_TAGS` to not include `ul` and set `LAZY_OL` to `False`. """ + + SIBLING_TAGS = ['ol'] + """ Exclude `ul` from list of siblings. """ + LAZY_OL = False + """ Disable lazy list behavior. """ + + def __init__(self, parser: blockparser.BlockParser): + super().__init__(parser) + # This restriction stems from the 'CodeBlockProcessor' class, + # which automatically matches blocks with an indent = self.tab_length + max_list_start_indent = self.tab_length - 1 + # Detect an item (e.g., `1. item`) + self.RE = re.compile(r'^[ ]{0,%d}[\*_]{0,2}\d+\.[ ]+(.*)' % max_list_start_indent) + # Detect items on secondary lines. they can be of either list type. 
+ self.CHILD_RE = re.compile(r'^[ ]{0,%d}([\*_]{0,2})((\d+\.))[ ]+(.*)' % (MIN_NESTED_LIST_INDENT - 1)) + # Detect indented (nested) items of either type + self.INDENT_RE = re.compile(r'^[ ]{%d,%d}[\*_]{0,2}((\d+\.)|[*+-])[ ]+.*' % + (MIN_NESTED_LIST_INDENT, self.tab_length * 2 - 1)) + + def run(self, parent: etree.Element, blocks: list[str]) -> None: + # Check for multiple items in one block. + items = self.get_items(blocks.pop(0)) + sibling = self.lastChild(parent) + + if sibling is not None and sibling.tag in self.SIBLING_TAGS: + # Previous block was a list item, so set that as parent + lst = sibling + # make sure previous item is in a `p` - if the item has text, + # then it isn't in a `p` + if lst[-1].text: + # since it's possible there are other children for this + # sibling, we can't just `SubElement` the `p`, we need to + # insert it as the first item. + p = etree.Element('p') + p.text = lst[-1].text + lst[-1].text = '' + lst[-1].insert(0, p) + # if the last item has a tail, then the tail needs to be put in a `p` + # likely only when a header is not followed by a blank line + lch = self.lastChild(lst[-1]) + if lch is not None and lch.tail: + p = etree.SubElement(lst[-1], 'p') + p.text = lch.tail.lstrip() + lch.tail = '' + + # parse first block differently as it gets wrapped in a `p`. + li = etree.SubElement(lst, 'li') + self.parser.state.set('looselist') + firstitem = items.pop(0) + self.parser.parseBlocks(li, [firstitem]) + self.parser.state.reset() + elif parent.tag in ['ol', 'ul']: + # this catches the edge case of a multi-item indented list whose + # first item is in a blank parent-list item: + # * * subitem1 + # * subitem2 + # see also `ListIndentProcessor` + lst = parent + else: + # This is a new list so create parent with appropriate tag. + lst = etree.SubElement(parent, self.TAG) + # Check if a custom start integer is set + if not self.LAZY_OL and self.STARTSWITH != '1': + lst.attrib['start'] = self.STARTSWITH + + self.parser.state.set('list') + # Loop through items in block, recursively parsing each with the + # appropriate parent. + for item in items: + if item.startswith(" " * MIN_NESTED_LIST_INDENT): + # Item is indented. Parse with last item as parent + self.parser.parseBlocks(lst[-1], [item]) + else: + # New item. Create `li` and parse with it as parent + li = etree.SubElement(lst, 'li') + self.parser.parseBlocks(li, [item]) + self.parser.state.reset() + + def looseDetab(self, text: str, indent_length: int, level: int = 1) -> str: + """ Remove indentation from front of lines but allowing dedented lines. """ + lines = text.split('\n') + for i in range(len(lines)): + if lines[i].startswith(' ' * indent_length * level): + lines[i] = lines[i][indent_length * level:] + return '\n'.join(lines) + + def get_items(self, block: str) -> list[str]: + """ Break a block into list items. """ + # If first level of list is indented, remove that indentation + if (indent_len := len(block) - len(block.lstrip())) > 0: + block = self.looseDetab(block, indent_len) + items = [] + for line in block.split('\n'): + m = self.CHILD_RE.match(line) + if m: + # This is a new list item + # Check first item for the start index + if not items: + # Detect the integer value of first list item + INTEGER_RE = re.compile(r'(\d+)') + self.STARTSWITH = INTEGER_RE.match(m.group(2)).group() + # Append to the list + items.append(m.group(1) + m.group(4)) + elif self.INDENT_RE.match(line): + # This is an indented (possibly nested) item. 
+ if items[-1].startswith(' ' * MIN_NESTED_LIST_INDENT): + # Previous item was indented. Append to that item. + items[-1] = '{}\n{}'.format(items[-1], line) + else: + items.append(line) + else: + # This is another line of previous item. Append to that item. + items[-1] = '{}\n{}'.format(items[-1], line) + return items + + +class SaneUListProcessor(SaneOListProcessor): + """ Override `SIBLING_TAGS` to not include `ol`. """ + + TAG: str = 'ul' + SIBLING_TAGS = ['ul'] + """ Exclude `ol` from list of siblings. """ + + def __init__(self, parser: blockparser.BlockParser): + super().__init__(parser) + # Detect an item (e.g., `- item` or `+ item` or `* item`). + max_list_start_indent = self.tab_length - 1 + self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % max_list_start_indent) + self.CHILD_RE = re.compile(r'^[ ]{0,%d}(([*+-]))[ ]+(.*)' % (MIN_NESTED_LIST_INDENT - 1)) + + def get_items(self, block: str) -> list[str]: + """ Break a block into list items. """ + # If first level of list is indented, remove that indentation + if (indent_len := len(block) - len(block.lstrip())) > 0: + block = self.looseDetab(block, indent_len) + items = [] + for line in block.split('\n'): + m = self.CHILD_RE.match(line) + if m: + # Append to the list + items.append(m.group(3)) + elif self.INDENT_RE.match(line): + # This is an indented (possibly nested) item. + if items[-1].startswith(' ' * MIN_NESTED_LIST_INDENT): + # Previous item was indented. Append to that item. + items[-1] = '{}\n{}'.format(items[-1], line) + else: + items.append(line) + else: + # This is another line of previous item. Append to that item. + items[-1] = '{}\n{}'.format(items[-1], line) + return items + + +class SaneParagraphProcessor(ParagraphProcessor): + """ Process Paragraph blocks. """ + + def __init__(self, parser: BlockParser): + super().__init__(parser) + max_list_start_indent = self.tab_length - 1 + self.LIST_RE = re.compile(r"\s{2}\n(\s{0,%d}[\d+*-])" % max_list_start_indent) + + def run(self, parent: etree.Element, blocks: list[str]) -> None: + block = blocks.pop(0) + if block.strip(): + # Not a blank block. Add to parent, otherwise throw it away. + if self.parser.state.isstate('list'): + # The parent is a tight-list. + # + # Check for any children. This will likely only happen in a + # tight-list when a header isn't followed by a blank line. + # For example: + # + # * # Header + # Line 2 of list item - not part of header. + sibling = self.lastChild(parent) + if sibling is not None: + # Insert after sibling. + if sibling.tail: + sibling.tail = '{}\n{}'.format(sibling.tail, block) + else: + sibling.tail = '\n%s' % block + else: + # Append to parent.text + if parent.text: + parent.text = '{}\n{}'.format(parent.text, block) + else: + parent.text = block.lstrip() + else: + # Check if paragraph contains a list + next_list_block = None + if list_match := self.LIST_RE.search(block): + list_start = list_match.end() - len(list_match.group(1)) + next_list_block = block[list_start:] + block = block[:list_start] + + # Create a regular paragraph + p = etree.SubElement(parent, 'p') + p.text = block.lstrip() + + # If a list was found, parse its block separately with the paragraph as the parent + if next_list_block: + self.parser.parseBlocks(p, [next_list_block]) + + +class SaneListExtension(Extension): + """ Add sane lists to Markdown. """ + + def extendMarkdown(self, md): + """ Override existing Processors. 
""" + md.parser.blockprocessors.register(SaneListIndentProcessor(md.parser), 'indent', 90) + md.parser.blockprocessors.register(SaneOListProcessor(md.parser), 'olist', 40) + md.parser.blockprocessors.register(SaneUListProcessor(md.parser), 'ulist', 30) + md.parser.blockprocessors.register(SaneParagraphProcessor(md.parser), 'paragraph', 10) + + +def makeExtension(**kwargs): # pragma: no cover + return SaneListExtension(**kwargs) From 0e673a7a4278deaedb3513b4e406cdcc45b19b4e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 4 Jan 2025 11:40:24 -0800 Subject: [PATCH 14/35] UI: reduce the size of HTML sent to the UI during streaming --- modules/html_generator.py | 130 +++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 72 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index e61fc558..1f12d77a 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -268,29 +268,24 @@ def generate_instruct_html(history): for i, _row in enumerate(history): row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row] - if row[0]: # don't display empty user messages - output += f""" -
-
-
- {row[0]} -
-
-
- """ + if row[0]: # Don't display empty user messages + output += ( + f'
' + f'
' + f'
{row[0]}
' + f'
' + f'
' + ) - output += f""" -
-
-
- {row[1]} -
-
-
- """ + output += ( + f'
' + f'
' + f'
{row[1]}
' + f'
' + f'
' + ) output += "" - return output @@ -298,44 +293,39 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= output = f'
' # We use ?character and ?time.time() to force the browser to reset caches - img_bot = f'' if Path("cache/pfp_character_thumb.png").exists() else '' - img_me = f'' if Path("cache/pfp_me.png").exists() else '' + img_bot = ( + f'' + if Path("cache/pfp_character_thumb.png").exists() else '' + ) + + img_me = ( + f'' + if Path("cache/pfp_me.png").exists() else '' + ) for i, _row in enumerate(history): row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row] - if row[0]: # don't display empty user messages - output += f""" -
-
- {img_me} -
-
-
- {name1} -
-
- {row[0]} -
-
-
- """ + if row[0]: # Don't display empty user messages + output += ( + f'
' + f'
{img_me}
' + f'
' + f'
{name1}
' + f'
{row[0]}
' + f'
' + f'
' + ) - output += f""" -
-
- {img_bot} -
-
-
- {name2} -
-
- {row[1]} -
-
-
- """ + output += ( + f'
' + f'
{img_bot}
' + f'
' + f'
{name2}
' + f'
{row[1]}
' + f'
' + f'
' + ) output += "
" return output @@ -347,26 +337,22 @@ def generate_chat_html(history, name1, name2, reset_cache=False): for i, _row in enumerate(history): row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row] - if row[0]: # don't display empty user messages - output += f""" -
-
-
- {row[0]} -
-
-
- """ + if row[0]: # Don't display empty user messages + output += ( + f'
' + f'
' + f'
{row[0]}
' + f'
' + f'
' + ) - output += f""" -
-
-
- {row[1]} -
-
-
- """ + output += ( + f'
' + f'
' + f'
{row[1]}
' + f'
' + f'
' + ) output += "" return output From 049297fa660125467d8d2aed00f91901871dbf52 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 4 Jan 2025 14:09:36 -0800 Subject: [PATCH 15/35] UI: reduce the size of CSS sent to the UI during streaming --- modules/html_generator.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/modules/html_generator.py b/modules/html_generator.py index 1f12d77a..e3550ed5 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -15,6 +15,29 @@ from modules.utils import get_available_chat_styles # This is to store the paths to the thumbnails of the profile pictures image_cache = {} + +def minify_css(css: str) -> str: + # Step 1: Remove comments + css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL) + + # Step 2: Remove leading and trailing whitespace + css = re.sub(r'^[ \t]*|[ \t]*$', '', css, flags=re.MULTILINE) + + # Step 3: Remove spaces after specific characters ({ : ; ,}) + css = re.sub(r'([:{;,])\s+', r'\1', css) + + # Step 4: Remove spaces before `{` + css = re.sub(r'\s+{', '{', css) + + # Step 5: Remove empty lines + css = re.sub(r'^\s*$', '', css, flags=re.MULTILINE) + + # Step 6: Collapse all lines into one + css = re.sub(r'\n', '', css) + + return css + + with open(Path(__file__).resolve().parent / '../css/html_readable_style.css', 'r') as f: readable_css = f.read() with open(Path(__file__).resolve().parent / '../css/html_instruct_style.css', 'r') as f: @@ -35,6 +58,12 @@ for k in chat_styles: style = match.group(1) chat_styles[k] = chat_styles.get(style, '') + '\n\n' + '\n'.join(lines[1:]) +# Reduce the size of the CSS sources above +readable_css = minify_css(readable_css) +instruct_css = minify_css(instruct_css) +for k in chat_styles: + chat_styles[k] = minify_css(chat_styles[k]) + def fix_newlines(string): string = string.replace('\n', '\n\n') From d56b5005689d17472626872c6fd40fc4208dff30 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 4 Jan 2025 16:22:40 -0800 Subject: [PATCH 16/35] UI: add padding to file saving dialog --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index 0ccaf106..dc89fbef 100644 --- a/css/main.css +++ b/css/main.css @@ -226,6 +226,7 @@ button { max-width: 500px; background-color: var(--input-background-fill); border: var(--input-border-width) solid var(--input-border-color) !important; + padding: 10px; } .file-saver > :first-child > :last-child { From 3967520e71ba0ab386893d7c7e946fd621e25b06 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 4 Jan 2025 16:22:59 -0800 Subject: [PATCH 17/35] Connect XTC, DRY, smoothing_factor, and dynatemp to ExLlamaV2 loader (non-HF) --- modules/exllamav2.py | 30 +++++++++++++++++++++++++++++- modules/loaders.py | 12 +++++++++++- modules/sampler_hijack.py | 4 +++- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 9b6da83c..0289bb21 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -1,8 +1,8 @@ +import json import traceback from pathlib import Path import torch - from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -15,6 +15,7 @@ from exllamav2 import ( ExLlamaV2Tokenizer ) from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator + from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length @@ -122,6 
+123,10 @@ class Exllamav2Model: settings.token_presence_penalty = state['presence_penalty'] settings.temperature = state['temperature'] + settings.smoothing_factor = state['smoothing_factor'] + settings.min_temp = state['dynatemp_low'] if state['dynamic_temperature'] else 0 + settings.max_temp = state['dynatemp_high'] if state['dynamic_temperature'] else 0 + settings.temp_exponent = state['dynatemp_exponent'] settings.top_k = state['top_k'] settings.top_p = state['top_p'] settings.top_a = state['top_a'] @@ -143,6 +148,29 @@ class Exllamav2Model: if len(to_ban) > 0: settings.disallow_tokens(self.tokenizer, to_ban) + settings.dry_allowed_length = state['dry_allowed_length'] + settings.dry_base = state['dry_base'] + settings.dry_multiplier = state['dry_multiplier'] + + # Dry sequence breakers processing + if state['dry_multiplier'] > 0 and state['dry_sequence_breakers']: + dry_sequence_breakers = state['dry_sequence_breakers'] + + # Support both JSON array notation and comma-separated strings. + if not dry_sequence_breakers.startswith("["): + dry_sequence_breakers = "[" + dry_sequence_breakers + "]" + + sequence_breaker_strings = json.loads(dry_sequence_breakers) + # Prefix with 'a' to get the correct encoding of the token at the end of a text. + sequence_breakers = { + self.encode(f"a{s}")[0, -1].item() for s in sequence_breaker_strings + } + + settings.dry_sequence_breakers = sequence_breakers + + settings.xtc_probability = state['xtc_probability'] + settings.xtc_threshold = state['xtc_threshold'] + ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True) ids = ids[:, -get_max_prompt_length(state):] diff --git a/modules/loaders.py b/modules/loaders.py index 1cfdb31b..a4edf822 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -194,6 +194,10 @@ loaders_samplers = { 'ExLlamav2': { 'temperature', 'temperature_last', + 'smoothing_factor', + 'dynatemp_low', + 'dynatemp_high', + 'dynatemp_exponent', 'top_p', 'min_p', 'top_k', @@ -204,10 +208,16 @@ loaders_samplers = { 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', - 'seed', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', + 'dry_multiplier', + 'dry_base', + 'dry_allowed_length', + 'dry_sequence_breakers', + 'xtc_threshold', + 'xtc_probability', + 'seed', 'ban_eos_token', 'add_bos_token', 'custom_token_bans', diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index 62ceca8d..d202af1f 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -495,7 +495,9 @@ def get_logits_processor_patch(self, **kwargs): sequence_breaker_strings = json.loads(dry_sequence_breakers) # Prefix with 'a' to get the correct encoding of the token at the end of a text. 
- sequence_breakers = {shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings} + sequence_breakers = { + shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings + } warpers.append( DRYLogitsProcessor( From 11af199aff41d0863a06d3407a22c0a13fb2709b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 4 Jan 2025 17:52:57 -0800 Subject: [PATCH 18/35] Add a "Static KV cache" option for transformers --- extensions/openai/typing.py | 1 + modules/loaders.py | 3 ++- modules/shared.py | 1 + modules/text_generation.py | 3 +++ modules/ui.py | 1 + modules/ui_parameters.py | 1 + settings-template.yaml | 1 + 7 files changed, 10 insertions(+), 1 deletion(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index f63c1f39..dfac8e03 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -42,6 +42,7 @@ class GenerationOptions(BaseModel): truncation_length: int = 0 max_tokens_second: int = 0 prompt_lookup_num_tokens: int = 0 + static_cache: bool = False custom_token_bans: str = "" sampler_priority: List[str] | str | None = Field(default=None, description="List of samplers where the first items will appear first in the stack. Example: [\"top_k\", \"temperature\", \"top_p\"].") auto_max_new_tokens: bool = False diff --git a/modules/loaders.py b/modules/loaders.py index a4edf822..e1a41bb1 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -183,7 +183,8 @@ def transformers_samplers(): 'add_bos_token', 'skip_special_tokens', 'auto_max_new_tokens', - 'prompt_lookup_num_tokens' + 'prompt_lookup_num_tokens', + 'static_cache', } diff --git a/modules/shared.py b/modules/shared.py index cab61226..f2ae05a6 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -46,6 +46,7 @@ settings = { 'max_tokens_second': 0, 'max_updates_second': 0, 'prompt_lookup_num_tokens': 0, + 'static_cache': False, 'custom_stopping_strings': '', 'custom_token_bans': '', 'auto_max_new_tokens': False, diff --git a/modules/text_generation.py b/modules/text_generation.py index db415dce..3e9788b8 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -302,6 +302,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings if state['prompt_lookup_num_tokens'] > 0: generate_params['prompt_lookup_num_tokens'] = state['prompt_lookup_num_tokens'] + if state['static_cache']: + generate_params['cache_implementation'] = 'static' + for k in ['epsilon_cutoff', 'eta_cutoff']: if state[k] > 0: generate_params[k] = state[k] * 1e-4 diff --git a/modules/ui.py b/modules/ui.py index a3bf520f..3c75f6ca 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -220,6 +220,7 @@ def list_interface_input_elements(): 'custom_stopping_strings', 'skip_special_tokens', 'stream', + 'static_cache', 'tfs', 'top_a', ] diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 727a1528..f22f6233 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -83,6 +83,7 @@ def create_ui(default_preset): shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') + 
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache') with gr.Column(): shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') diff --git a/settings-template.yaml b/settings-template.yaml index 59c76c35..d5ed47c3 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -22,6 +22,7 @@ ban_eos_token: false add_bos_token: true skip_special_tokens: true stream: true +static_cache: false character: Assistant name1: You custom_system_message: '' From c0f600c8879fb28c5b84dd74bc6384619bfcf4fa Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 5 Jan 2025 05:45:12 -0800 Subject: [PATCH 19/35] Add a --torch-compile flag for transformers --- modules/loaders.py | 3 ++- modules/models.py | 3 +++ modules/shared.py | 1 + modules/ui.py | 3 ++- modules/ui_model_menu.py | 1 + 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index e1a41bb1..191126b3 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -9,12 +9,13 @@ loaders_and_params = OrderedDict({ 'Transformers': [ 'cpu_memory', 'gpu_memory', + 'load_in_4bit', 'load_in_8bit', + 'torch_compile', 'bf16', 'cpu', 'disk', 'auto_devices', - 'load_in_4bit', 'use_double_quant', 'quant_type', 'compute_dtype', diff --git a/modules/models.py b/modules/models.py index d906535b..cb1ba218 100644 --- a/modules/models.py +++ b/modules/models.py @@ -254,6 +254,9 @@ def huggingface_loader(model_name): print() model = LoaderClass.from_pretrained(path_to_model, **params) + if shared.args.torch_compile: + model = torch.compile(model) + return model diff --git a/modules/shared.py b/modules/shared.py index f2ae05a6..891c0556 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -104,6 +104,7 @@ group.add_argument('--force-safetensors', action='store_true', help='Set use_saf group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.') group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.') group.add_argument('--use_eager_attention', action='store_true', help='Set attn_implementation= eager while loading the model.') +group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.') # bitsandbytes 4-bit group = parser.add_argument_group('bitsandbytes 4-bit') diff --git a/modules/ui.py b/modules/ui.py index 3c75f6ca..30d4163c 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -109,12 +109,13 @@ def list_model_elements(): 'disk', 'cpu', 'bf16', + 'load_in_4bit', 'load_in_8bit', + 'torch_compile', 'trust_remote_code', 'no_use_fast', 'use_flash_attention_2', 'use_eager_attention', - 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index c4bb8f01..f2814401 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -108,6 +108,7 @@ def create_ui(): shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. 
This may increase performance on newer cards.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) + shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') From 03a0f236a483d2b05094cc6e5546cfa28bd1668b Mon Sep 17 00:00:00 2001 From: FP HAM Date: Wed, 8 Jan 2025 14:54:09 -0500 Subject: [PATCH 20/35] Training_PRO fix: add `if 'quantization_config' in shared.model.config.to_dict()` --- extensions/Training_PRO/script.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/extensions/Training_PRO/script.py b/extensions/Training_PRO/script.py index 5365154c..01bcf67d 100644 --- a/extensions/Training_PRO/script.py +++ b/extensions/Training_PRO/script.py @@ -789,7 +789,11 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'): logger.info("Getting model ready...") # here we can disable gradient checkpoint, by default = true, use_gradient_checkpointing=True - prepare_model_for_kbit_training(shared.model) + if 'quantization_config' in shared.model.config.to_dict(): + print(f"Method: {RED}QLORA{RESET}") + prepare_model_for_kbit_training(shared.model) + else: + print(f"Method: {RED}LoRA{RESET}") # base model is now frozen and should not be reused for any other LoRA training than this one shared.model_dirty_from_training = True From 1f867229770b902939cd319a9950802b42c4a104 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:56:55 -0300 Subject: [PATCH 21/35] Update safetensors requirement from ==0.4.* to ==0.5.* (#6634) --- requirements.txt | 2 +- requirements_amd.txt | 2 +- requirements_amd_noavx2.txt | 2 +- requirements_apple_intel.txt | 2 +- requirements_apple_silicon.txt | 2 +- requirements_cpu_only.txt | 2 +- requirements_cpu_only_noavx2.txt | 2 +- requirements_noavx2.txt | 2 +- requirements_nowheels.txt | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8b36f54f..fb8ca6b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard diff --git a/requirements_amd.txt b/requirements_amd.txt index 1b85b20f..88b4b887 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -16,7 +16,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 1bafc141..ef43da99 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -16,7 +16,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* 
+safetensors==0.5.* scipy sentencepiece tensorboard diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 345368b7..369d34d8 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -16,7 +16,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index a3c3055b..9eefd8b8 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -16,7 +16,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index a4b7882c..f5313ecf 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -16,7 +16,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 878aea06..aefe31d0 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -16,7 +16,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 1a345611..9e1633ca 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -17,7 +17,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 95fe3add..45003f0d 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -16,7 +16,7 @@ pydantic==2.8.2 pyyaml requests rich -safetensors==0.4.* +safetensors==0.5.* scipy sentencepiece tensorboard From d3adcbf64b782ee04262af1fca8eb1a837a1ea67 Mon Sep 17 00:00:00 2001 From: Jack Cloudman Date: Wed, 8 Jan 2025 14:30:21 -0600 Subject: [PATCH 22/35] Add `--exclude-pattern` flag to download-model.py script (#6542) --- download-model.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/download-model.py b/download-model.py index 306784a3..8fe94371 100644 --- a/download-model.py +++ b/download-model.py @@ -72,7 +72,7 @@ class ModelDownloader: return model, branch - def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None): + def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None, exclude_pattern=None): session = self.session page = f"/api/models/{model}/tree/{branch}" cursor = b"" @@ -100,13 +100,17 @@ class ModelDownloader: if specific_file not in [None, ''] and fname != specific_file: continue + # Exclude files matching the exclude pattern + if exclude_pattern is not None and re.match(exclude_pattern, fname): + continue + if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')): is_lora = True is_pytorch = re.match(r"(pytorch|adapter|gptq)_model.*\.bin", fname) is_safetensors = re.match(r".*\.safetensors", fname) is_pt = re.match(r".*\.pt", fname) - is_gguf = re.match(r'.*\.gguf', fname) + is_gguf = re.match(r".*\.gguf", fname) is_tiktoken = re.match(r".*\.tiktoken", fname) is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname) or is_tiktoken is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer @@ -140,7 +144,6 @@ class ModelDownloader: # If both pytorch and safetensors are available, download safetensors only # 
Also if GGUF and safetensors are available, download only safetensors - # (why do people do this?) if (has_pytorch or has_pt or has_gguf) and has_safetensors: has_gguf = False for i in range(len(classifications) - 1, -1, -1): @@ -148,8 +151,6 @@ class ModelDownloader: links.pop(i) # For GGUF, try to download only the Q4_K_M if no specific file is specified. - # If not present, exclude all GGUFs, as that's likely a repository with both - # GGUF and fp16 files. if has_gguf and specific_file is None: has_q4km = False for i in range(len(classifications) - 1, -1, -1): @@ -312,6 +313,7 @@ if __name__ == '__main__': parser.add_argument('--threads', type=int, default=4, help='Number of files to download simultaneously.') parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).') parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).') + parser.add_argument('--exclude-pattern', type=str, default=None, help='Regex pattern to exclude files from download.') parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.') parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).') parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.') @@ -322,6 +324,7 @@ if __name__ == '__main__': branch = args.branch model = args.MODEL specific_file = args.specific_file + exclude_pattern = args.exclude_pattern if model is None: print("Error: Please specify the model you'd like to download (e.g. 'python download-model.py facebook/opt-1.3b').") @@ -336,7 +339,9 @@ if __name__ == '__main__': sys.exit() # Get the download links from Hugging Face - links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only, specific_file=specific_file) + links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface( + model, branch, text_only=args.text_only, specific_file=specific_file, exclude_pattern=exclude_pattern + ) # Get the output folder if args.output: @@ -349,4 +354,7 @@ if __name__ == '__main__': downloader.check_model_files(model, branch, links, sha256, output_folder) else: # Download files - downloader.download_model_files(model, branch, links, sha256, output_folder, specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp) + downloader.download_model_files( + model, branch, links, sha256, output_folder, + specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp + ) From 7157257c3f4691b9e4b56ceddb8c428802ac4c54 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Wed, 8 Jan 2025 19:28:56 -0300 Subject: [PATCH 23/35] Remove the AutoGPTQ loader (#6641) --- README.md | 2 +- modules/AutoGPTQ_loader.py | 74 -------------------------------------- modules/LoRA.py | 38 ++------------------ modules/loaders.py | 21 ----------- modules/models.py | 29 +-------------- modules/models_settings.py | 41 +++------------------ modules/shared.py | 23 +++++------- modules/ui.py | 8 ----- modules/ui_model_menu.py | 9 ----- one_click.py | 2 +- 10 files changed, 19 insertions(+), 228 deletions(-) delete mode 100644 modules/AutoGPTQ_loader.py diff --git a/README.md b/README.md index 0e16aa30..bec686ad 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Its goal is to become the 
[AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually. +- Supports multiple text generation backends in a single UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually. - OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Automatic prompt formatting using Jinja2 templates. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py deleted file mode 100644 index 69e8f299..00000000 --- a/modules/AutoGPTQ_loader.py +++ /dev/null @@ -1,74 +0,0 @@ -from pathlib import Path - -from accelerate.utils import is_xpu_available -from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig - -import modules.shared as shared -from modules.logging_colors import logger -from modules.models import get_max_memory_dict - - -def load_quantized(model_name): - path_to_model = Path(f'{shared.args.model_dir}/{model_name}') - pt_path = None - - # Find the model checkpoint - if shared.args.checkpoint: - pt_path = Path(shared.args.checkpoint) - else: - for ext in ['.safetensors', '.pt', '.bin']: - found = list(path_to_model.glob(f"*{ext}")) - if len(found) > 0: - if len(found) > 1: - logger.warning(f'More than one {ext} model has been found. The last one will be selected. 
It could be wrong.') - - pt_path = found[-1] - break - - if pt_path is None: - logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.") - return - - use_safetensors = pt_path.suffix == '.safetensors' - if not (path_to_model / "quantize_config.json").exists(): - quantize_config = BaseQuantizeConfig( - bits=bits if (bits := shared.args.wbits) > 0 else 4, - group_size=gs if (gs := shared.args.groupsize) > 0 else -1, - desc_act=shared.args.desc_act - ) - else: - quantize_config = None - - # Define the params for AutoGPTQForCausalLM.from_quantized - params = { - 'model_basename': pt_path.stem, - 'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu", - 'use_triton': shared.args.triton, - 'inject_fused_attention': False, - 'inject_fused_mlp': not shared.args.no_inject_fused_mlp, - 'use_safetensors': use_safetensors, - 'trust_remote_code': shared.args.trust_remote_code, - 'max_memory': get_max_memory_dict(), - 'quantize_config': quantize_config, - 'use_cuda_fp16': not shared.args.no_use_cuda_fp16, - 'disable_exllama': shared.args.disable_exllama, - 'disable_exllamav2': shared.args.disable_exllamav2, - } - - logger.info(f"The AutoGPTQ params are: {params}") - model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params) - - # These lines fix the multimodal extension when used with AutoGPTQ - if hasattr(model, 'model'): - if not hasattr(model, 'dtype'): - if hasattr(model.model, 'dtype'): - model.dtype = model.model.dtype - - if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'): - if not hasattr(model, 'embed_tokens'): - model.embed_tokens = model.model.model.embed_tokens - - if not hasattr(model.model, 'embed_tokens'): - model.model.embed_tokens = model.model.model.embed_tokens - - return model diff --git a/modules/LoRA.py b/modules/LoRA.py index e1ad01d7..1f4883e2 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -2,13 +2,11 @@ from pathlib import Path import modules.shared as shared from modules.logging_colors import logger -from modules.models import get_device, reload_model +from modules.models import get_device def add_lora_to_model(lora_names): - if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ': - add_lora_autogptq(lora_names) - elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']: + if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']: add_lora_exllamav2(lora_names) else: add_lora_transformers(lora_names) @@ -48,38 +46,6 @@ def add_lora_exllamav2(lora_names): shared.model.loras = None -def add_lora_autogptq(lora_names): - ''' - Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing - ''' - - try: - from auto_gptq import get_gptq_peft_model - from auto_gptq.utils.peft_utils import GPTQLoraConfig - except: - logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.") - return - - if len(lora_names) == 0: - reload_model() - - shared.lora_names = [] - return - else: - if len(lora_names) > 1: - logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. 
Only the first one in the list will be loaded.') - - peft_config = GPTQLoraConfig( - inference_mode=True, - ) - - lora_path = get_lora_path(lora_names[0]) - logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]]))) - shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path) - shared.lora_names = [lora_names[0]] - return - - def add_lora_transformers(lora_names): from peft import PeftModel diff --git a/modules/loaders.py b/modules/loaders.py index 191126b3..4e331dbb 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -25,8 +25,6 @@ loaders_and_params = OrderedDict({ 'use_eager_attention', 'alpha_value', 'compress_pos_emb', - 'disable_exllama', - 'disable_exllamav2', ], 'llama.cpp': [ 'n_ctx', @@ -107,24 +105,6 @@ loaders_and_params = OrderedDict({ 'compress_pos_emb', 'exllamav2_info', ], - 'AutoGPTQ': [ - 'triton', - 'no_inject_fused_mlp', - 'no_use_cuda_fp16', - 'wbits', - 'groupsize', - 'desc_act', - 'disable_exllama', - 'disable_exllamav2', - 'gpu_memory', - 'cpu_memory', - 'cpu', - 'disk', - 'auto_devices', - 'trust_remote_code', - 'no_use_fast', - 'autogptq_info', - ], 'HQQ': [ 'hqq_backend', 'trust_remote_code', @@ -191,7 +171,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), - 'AutoGPTQ': transformers_samplers(), 'HQQ': transformers_samplers(), 'ExLlamav2': { 'temperature', diff --git a/modules/models.py b/modules/models.py index cb1ba218..9c58b279 100644 --- a/modules/models.py +++ b/modules/models.py @@ -3,7 +3,6 @@ import os import pprint import re import time -import traceback from pathlib import Path import torch @@ -21,7 +20,6 @@ from transformers import ( AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, - GPTQConfig, is_torch_npu_available, is_torch_xpu_available ) @@ -73,7 +71,6 @@ def load_model(model_name, loader=None): 'llamacpp_HF': llamacpp_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, - 'AutoGPTQ': AutoGPTQ_loader, 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -164,7 +161,7 @@ def huggingface_loader(model_name): LoaderClass = AutoModelForCausalLM # Load the model without any special settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]): logger.info("TRANSFORMERS_PARAMS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) print() @@ -229,21 +226,6 @@ def huggingface_loader(model_name): if shared.args.disk: params['offload_folder'] = shared.args.disk_cache_dir - if shared.args.disable_exllama or shared.args.disable_exllamav2: - try: - gptq_config = GPTQConfig( - bits=config.quantization_config.get('bits', 4), - disable_exllama=shared.args.disable_exllama, - disable_exllamav2=shared.args.disable_exllamav2, - ) - - params['quantization_config'] = gptq_config - logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.') - except: - exc = 
traceback.format_exc() - logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?') - print(exc) - if shared.args.compress_pos_emb > 1: params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} elif shared.args.alpha_value > 1: @@ -310,15 +292,6 @@ def ExLlamav2_HF_loader(model_name): return Exllamav2HF.from_pretrained(model_name) -def AutoGPTQ_loader(model_name): - try: - import modules.AutoGPTQ_loader - except ModuleNotFoundError: - raise ModuleNotFoundError("Failed to import 'autogptq'. Please install it manually following the instructions in the AutoGPTQ GitHub repository.") - - return modules.AutoGPTQ_loader.load_quantized(model_name) - - def HQQ_loader(model_name): try: from hqq.core.quantize import HQQBackend, HQQLinear diff --git a/modules/models_settings.py b/modules/models_settings.py index 1bb00ceb..8d658523 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -11,9 +11,6 @@ def get_fallback_settings(): return { 'bf16': False, 'use_eager_attention': False, - 'wbits': 'None', - 'groupsize': 'None', - 'desc_act': False, 'max_seq_len': 2048, 'n_ctx': 2048, 'rope_freq_base': 0, @@ -111,26 +108,6 @@ def get_model_metadata(model): if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']: model_settings['use_eager_attention'] = True - # Read GPTQ metadata for old GPTQ loaders - if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2': - if 'bits' in metadata['quantization_config']: - model_settings['wbits'] = metadata['quantization_config']['bits'] - if 'group_size' in metadata['quantization_config']: - model_settings['groupsize'] = metadata['quantization_config']['group_size'] - if 'desc_act' in metadata['quantization_config']: - model_settings['desc_act'] = metadata['quantization_config']['desc_act'] - - # Read AutoGPTQ metadata - path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json') - if path.exists(): - metadata = json.loads(open(path, 'r', encoding='utf-8').read()) - if 'bits' in metadata: - model_settings['wbits'] = metadata['bits'] - if 'group_size' in metadata: - model_settings['groupsize'] = metadata['group_size'] - if 'desc_act' in metadata: - model_settings['desc_act'] = metadata['desc_act'] - # Try to find the Jinja instruct template path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' if path.exists(): @@ -178,7 +155,7 @@ def infer_loader(model_name, model_settings): path_to_model = Path(f'{shared.args.model_dir}/{model_name}') if not path_to_model.exists(): loader = None - elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0): + elif (path_to_model / 'quantize_config.json').exists(): # Old GPTQ metadata file loader = 'ExLlamav2_HF' elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists(): loader = 'llamacpp_HF' @@ -215,16 +192,11 @@ def update_model_parameters(state, initial=False): if initial and element in shared.provided_arguments: continue - # Setting null defaults - if element in ['wbits', 'groupsize'] and value == 'None': - value = vars(shared.args_defaults)[element] - elif element in ['cpu_memory'] and value == 0: + if element in ['cpu_memory'] and value == 0: value = vars(shared.args_defaults)[element] # Making some 
simple conversions - if element in ['wbits', 'groupsize']: - value = int(value) - elif element == 'cpu_memory' and value is not None: + if element == 'cpu_memory' and value is not None: value = f"{value}MiB" setattr(shared.args, element, value) @@ -251,15 +223,12 @@ def apply_model_settings_to_state(model, state): loader = model_settings.pop('loader') # If the user is using an alternative loader for the same model type, let them keep using it - if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']): + if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): state['loader'] = loader for k in model_settings: if k in state: - if k in ['wbits', 'groupsize']: - state[k] = str(model_settings[k]) - else: - state[k] = model_settings[k] + state[k] = model_settings[k] return state diff --git a/modules/shared.py b/modules/shared.py index 891c0556..6a83baae 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -86,7 +86,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -147,17 +147,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') -# AutoGPTQ -group = parser.add_argument_group('AutoGPTQ') -group.add_argument('--triton', action='store_true', help='Use triton.') -group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.') -group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.') -group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') -group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.') -group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.') -group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') -group.add_argument('--groupsize', type=int, default=-1, help='Group size.') - # HQQ group = parser.add_argument_group('HQQ') group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. 
Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') @@ -220,6 +209,14 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='DEP group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED') group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED') group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED') +group.add_argument('--triton', action='store_true', help='DEPRECATED') +group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED') +group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED') +group.add_argument('--desc_act', action='store_true', help='DEPRECATED') +group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED') +group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED') +group.add_argument('--wbits', type=int, default=0, help='DEPRECATED') +group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -262,8 +259,6 @@ def fix_loader_name(name): return 'llamacpp_HF' elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']: return 'Transformers' - elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']: - return 'AutoGPTQ' elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: return 'ExLlama' elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: diff --git a/modules/ui.py b/modules/ui.py index 30d4163c..e66de434 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -119,14 +119,6 @@ def list_model_elements(): 'compute_dtype', 'quant_type', 'use_double_quant', - 'wbits', - 'groupsize', - 'triton', - 'desc_act', - 'no_inject_fused_mlp', - 'no_use_cuda_fp16', - 'disable_exllama', - 'disable_exllamav2', 'cfg_cache', 'no_flash_attn', 'no_xformers', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index f2814401..eac0cba6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -89,8 +89,6 @@ def create_ui(): shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") - shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. 
⚠️ Lower this value if you can\'t load the model.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') @@ -121,10 +119,6 @@ def create_ui(): shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') - shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') - shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') - shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) @@ -136,13 +130,10 @@ def create_ui(): shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.') - shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') - shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. 
To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") - shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') with gr.Column(): diff --git a/one_click.py b/one_click.py index 8fc1edf0..ca11efac 100644 --- a/one_click.py +++ b/one_click.py @@ -394,7 +394,7 @@ def update_requirements(initial_installation=False, pull=True): textgen_requirements = [ req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements - if "auto-gptq" not in req.lower() and "autoawq" not in req.lower() + if "autoawq" not in req.lower() ] if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11 From ad118056b87ff03f4892baed34c605851db3a923 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:29:46 -0800 Subject: [PATCH 24/35] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bec686ad..07138772 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in a single UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually. +- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually. - OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Automatic prompt formatting using Jinja2 templates. 
- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. From 91a8a878878e57f221ca3992ec8c425cf3805d88 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Jan 2025 15:07:21 -0800 Subject: [PATCH 25/35] Remove obsolete code --- modules/shared.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 6a83baae..7829e462 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -259,8 +259,6 @@ def fix_loader_name(name): return 'llamacpp_HF' elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']: return 'Transformers' - elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: - return 'ExLlama' elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: return 'ExLlamav2' elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: From b9e2ded6d4ea32ceccea87d1f0a37ed003f0316e Mon Sep 17 00:00:00 2001 From: nclok1405 <155463060+nclok1405@users.noreply.github.com> Date: Thu, 9 Jan 2025 09:17:31 +0900 Subject: [PATCH 26/35] Added UnicodeDecodeError workaround for modules/llamacpp_model.py (#6040) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- modules/llamacpp_model.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 6a76ee4e..c79755e4 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -122,7 +122,14 @@ class LlamaCppModel: return self.model.tokenize(string) def decode(self, ids, **kwargs): - return self.model.detokenize(ids).decode('utf-8') + detokenized = self.model.detokenize(ids) + try: + # Attempt strict UTF-8 decoding first + return detokenized.decode('utf-8', 'strict') + except UnicodeDecodeError as e: + # Log the error and fall back to UTF-8 with replacement + logger.warning(f"Invalid UTF-8 in detokenized output. 
Using replacement characters.\n{e}") + return detokenized.decode('utf-8', 'replace') def get_logits(self, tokens): self.model.reset() From e6796c3859350a082174436922709818d2aa74ef Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Jan 2025 17:24:21 -0800 Subject: [PATCH 27/35] Bump llama-cpp-python to 0.3.6, add macOS 14 and 15 wheels --- requirements.txt | 24 ++++++++++++------------ requirements_amd.txt | 12 ++++++------ requirements_amd_noavx2.txt | 8 ++++---- requirements_apple_intel.txt | 6 ++++-- requirements_apple_silicon.txt | 10 ++++++---- requirements_cpu_only.txt | 8 ++++---- requirements_cpu_only_noavx2.txt | 8 ++++---- requirements_noavx2.txt | 24 ++++++++++++------------ 8 files changed, 52 insertions(+), 48 deletions(-) diff --git a/requirements.txt b/requirements.txt index fb8ca6b3..6539161c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,22 +32,22 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 88b4b887..2e5f2da7 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -31,14 +31,14 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.5+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.5+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.6+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.6+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index ef43da99..b1eb7d31 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -31,10 +31,10 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no 
AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 369d34d8..6a9bf7f7 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -31,6 +31,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 9eefd8b8..d8928d58 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -31,8 +31,10 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index f5313ecf..84658a11 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -31,7 +31,7 @@ 
sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index aefe31d0..5944d5a7 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -31,7 +31,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == 
"Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 9e1633ca..fda4292d 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -32,22 +32,22 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp310-cp310-win_amd64.whl; 
platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" From 4ffc9ffc7a84c39f2442d060eff531cf8f70a084 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Jan 2025 17:24:38 -0800 Subject: [PATCH 28/35] UI: fix a list style --- css/main.css | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/css/main.css b/css/main.css index dc89fbef..6febb471 100644 --- a/css/main.css +++ b/css/main.css @@ -500,8 +500,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { margin-bottom: 0.5em !important; } -.message-body ul.long-list li, -.message-body ol.long-list li { +.message-body ul.long-list > li, +.message-body ol.long-list > li { margin-top: 1.25em !important; margin-bottom: 1.25em !important; } From 5c89068168e6a5e65d55832e7a725f4688bc9434 Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Jan 2025 17:36:30 -0800 Subject: [PATCH 29/35] UI: add an info message for the new Static KV cache option --- modules/ui_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index f22f6233..c8fd6bc7 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -83,7 +83,7 @@ def create_ui(default_preset): shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') - shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache') + shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') with gr.Column(): shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') From 619265b32c9c436cd2687849ba2c56fa65a18033 Mon Sep 17 00:00:00 2001 From: BPplays <58504799+BPplays@users.noreply.github.com> Date: Thu, 9 Jan 2025 05:23:44 -0800 Subject: [PATCH 30/35] add ipv6 support to the API (#6559) --- extensions/openai/script.py | 36 ++++++++++++++++++++++++++---------- modules/shared.py | 2 ++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 03d99e8d..f23caf9b 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -353,23 +353,38 @@ async def handle_unload_loras(): def run_server(): - server_addr = '0.0.0.0' if shared.args.listen else '127.0.0.1' + # Parse configuration port = int(os.environ.get('OPENEDAI_PORT', shared.args.api_port)) - ssl_certfile = os.environ.get('OPENEDAI_CERT_PATH', shared.args.ssl_certfile) ssl_keyfile = os.environ.get('OPENEDAI_KEY_PATH', shared.args.ssl_keyfile) + # In the server configuration: + server_addrs = [] + if os.environ.get('OPENEDAI_ENABLE_IPV6', shared.args.api_enable_ipv6): + server_addrs.append('[::]' if shared.args.listen else '[::1]') + if not os.environ.get('OPENEDAI_DISABLE_IPV4', shared.args.api_disable_ipv4): + server_addrs.append('0.0.0.0' if shared.args.listen else '127.0.0.1') + + if not server_addrs: + raise Exception('you MUST enable IPv6 or IPv4 for the API to work') + + # Log server information if shared.args.public_api: - def on_start(public_url: str): - logger.info(f'OpenAI-compatible API URL:\n\n{public_url}\n') - - _start_cloudflared(port, shared.args.public_api_id, max_attempts=3, on_start=on_start) + _start_cloudflared( + port, + shared.args.public_api_id, + max_attempts=3, + on_start=lambda url: logger.info(f'OpenAI-compatible API URL:\n\n{url}\n') + ) else: - if ssl_keyfile and ssl_certfile: - logger.info(f'OpenAI-compatible API URL:\n\nhttps://{server_addr}:{port}\n') + url_proto = 'https://' if (ssl_certfile and ssl_keyfile) else 'http://' + urls = [f'{url_proto}{addr}:{port}' for addr in server_addrs] + if len(urls) > 1: + 
logger.info('OpenAI-compatible API URLs:\n\n' + '\n'.join(urls) + '\n') else: - logger.info(f'OpenAI-compatible API URL:\n\nhttp://{server_addr}:{port}\n') + logger.info('OpenAI-compatible API URL:\n\n' + '\n'.join(urls) + '\n') + # Log API keys if shared.args.api_key: if not shared.args.admin_key: shared.args.admin_key = shared.args.api_key @@ -379,8 +394,9 @@ def run_server(): if shared.args.admin_key and shared.args.admin_key != shared.args.api_key: logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n') + # Start server logging.getLogger("uvicorn.error").propagate = False - uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) + uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) def setup(): diff --git a/modules/shared.py b/modules/shared.py index 7829e462..a0070b1f 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -193,6 +193,8 @@ group.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudf group.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.') group.add_argument('--api-key', type=str, default='', help='API authentication key.') group.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.') +group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 for the API') +group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API') group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.') # Multimodal From 03b4067f3197de0ef753d2fe415785e0a5dffebb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:58:33 -0800 Subject: [PATCH 31/35] Installer: ask 1 question for NVIDIA users instead of 2 --- docker/amd/Dockerfile | 2 +- docker/cpu/Dockerfile | 2 +- docker/intel/Dockerfile | 2 +- docker/nvidia/Dockerfile | 2 +- one_click.py | 45 +++++++++++++++++++++------------------- 5 files changed, 28 insertions(+), 25 deletions(-) diff --git a/docker/amd/Dockerfile b/docker/amd/Dockerfile index 365e88e3..cfbcf7e4 100644 --- a/docker/amd/Dockerfile +++ b/docker/amd/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=B USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose +RUN GPU_CHOICE=C LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile index 04ccf94a..8f643d2f 100644 --- a/docker/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -17,7 +17,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=N USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose +RUN GPU_CHOICE=N LAUNCH_AFTER_INSTALL=FALSE 
INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime diff --git a/docker/intel/Dockerfile b/docker/intel/Dockerfile index bc67a185..d2ed671e 100644 --- a/docker/intel/Dockerfile +++ b/docker/intel/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=D USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose +RUN GPU_CHOICE=E LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile index 66a717a7..900a4329 100644 --- a/docker/nvidia/Dockerfile +++ b/docker/nvidia/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose +RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui diff --git a/one_click.py b/one_click.py index ca11efac..e78a2450 100644 --- a/one_click.py +++ b/one_click.py @@ -232,33 +232,45 @@ def get_user_choice(question, options_dict): def install_webui(): - # Ask the user for the GPU vendor if "GPU_CHOICE" in os.environ: choice = os.environ["GPU_CHOICE"].upper() print_big_message(f"Selected GPU choice \"{choice}\" based on the GPU_CHOICE environment variable.") + + # Warn about changed meanings and handle old NVIDIA choice + if choice == "B": + print_big_message("Warning: GPU_CHOICE='B' now means 'NVIDIA (CUDA 11.8)' in the new version.") + elif choice == "C": + print_big_message("Warning: GPU_CHOICE='C' now means 'AMD' in the new version.") + elif choice == "D": + print_big_message("Warning: GPU_CHOICE='D' now means 'Apple M Series' in the new version.") + elif choice == "A" and "USE_CUDA118" in os.environ: + choice = "B" if os.environ.get("USE_CUDA118", "").lower() in ("yes", "y", "true", "1", "t", "on") else "A" else: choice = get_user_choice( "What is your GPU?", { - 'A': 'NVIDIA', - 'B': 'AMD (Linux/MacOS only. 
Requires ROCm SDK 6.1 on Linux)', - 'C': 'Apple M Series', - 'D': 'Intel Arc (IPEX)', - 'N': 'None (I want to run models in CPU mode)' + 'A': 'NVIDIA - CUDA 12.1 (recommended)', + 'B': 'NVIDIA - CUDA 11.8 (legacy GPUs)', + 'C': 'AMD - Linux/macOS only, requires ROCm 6.1', + 'D': 'Apple M Series', + 'E': 'Intel Arc (beta)', + 'N': 'CPU mode' }, ) + # Convert choices to GPU names for compatibility gpu_choice_to_name = { "A": "NVIDIA", - "B": "AMD", - "C": "APPLE", - "D": "INTEL", + "B": "NVIDIA", + "C": "AMD", + "D": "APPLE", + "E": "INTEL", "N": "NONE" } selected_gpu = gpu_choice_to_name[choice] - use_cuda118 = "N" + use_cuda118 = (choice == "B") # CUDA version is now determined by menu choice # Write a flag to CMD_FLAGS.txt for CPU mode if selected_gpu == "NONE": @@ -267,18 +279,9 @@ def install_webui(): print_big_message("Adding the --cpu flag to CMD_FLAGS.txt.") cmd_flags_file.write("\n--cpu\n") - # Check if the user wants CUDA 11.8 + # Handle CUDA version display elif any((is_windows(), is_linux())) and selected_gpu == "NVIDIA": - if "USE_CUDA118" in os.environ: - use_cuda118 = "Y" if os.environ.get("USE_CUDA118", "").lower() in ("yes", "y", "true", "1", "t", "on") else "N" - else: - print("\nDo you want to use CUDA 11.8 instead of 12.1?\nOnly choose this option if your GPU is very old (Kepler or older).\n\nFor RTX and GTX series GPUs, say \"N\".\nIf unsure, say \"N\".\n") - use_cuda118 = input("Input (Y/N)> ").upper().strip('"\'').strip() - while use_cuda118 not in 'YN': - print("Invalid choice. Please try again.") - use_cuda118 = input("Input> ").upper().strip('"\'').strip() - - if use_cuda118 == 'Y': + if use_cuda118: print("CUDA: 11.8") else: print("CUDA: 12.1") From c08d87b78db0acf13cb3d7ced94f8c9b5b78ac91 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:23:38 -0800 Subject: [PATCH 32/35] Make the huggingface loader more readable --- modules/models.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/modules/models.py b/modules/models.py index 9c58b279..f551b828 100644 --- a/modules/models.py +++ b/modules/models.py @@ -160,8 +160,22 @@ def huggingface_loader(model_name): else: LoaderClass = AutoModelForCausalLM + # Determine if we should use default loading + should_use_default_loading = not any([ + shared.args.cpu, + shared.args.load_in_8bit, + shared.args.load_in_4bit, + shared.args.auto_devices, + shared.args.disk, + shared.args.deepspeed, + shared.args.gpu_memory is not None, + shared.args.cpu_memory is not None, + shared.args.compress_pos_emb > 1, + shared.args.alpha_value > 1, + ]) + # Load the model without any special settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1]): + if should_use_default_loading: logger.info("TRANSFORMERS_PARAMS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) print() @@ -174,8 +188,20 @@ def huggingface_loader(model_name): # DeepSpeed ZeRO-3 elif shared.args.deepspeed: - model = LoaderClass.from_pretrained(path_to_model, torch_dtype=params['torch_dtype'], trust_remote_code=params.get('trust_remote_code')) - model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0] + model = LoaderClass.from_pretrained( + 
path_to_model, + torch_dtype=params['torch_dtype'], + trust_remote_code=params.get('trust_remote_code') + ) + + model = deepspeed.initialize( + model=model, + config_params=ds_config, + model_parameters=None, + optimizer=None, + lr_scheduler=None + )[0] + model.module.eval() # Inference logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}') @@ -197,16 +223,15 @@ def huggingface_loader(model_name): # and https://huggingface.co/blog/4bit-transformers-bitsandbytes quantization_config_params = { 'load_in_4bit': True, - 'bnb_4bit_compute_dtype': eval("torch.{}".format(shared.args.compute_dtype)) if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None, + 'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None, 'bnb_4bit_quant_type': shared.args.quant_type, 'bnb_4bit_use_double_quant': shared.args.use_double_quant, 'llm_int8_enable_fp32_cpu_offload': True } - params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params) elif shared.args.load_in_8bit: - if any((shared.args.auto_devices, shared.args.gpu_memory)): + if shared.args.auto_devices or shared.args.gpu_memory: params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True) else: params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True) From 3020f2e5ec823eef91475bcbcc929b1b2d9ca614 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:44:03 -0800 Subject: [PATCH 33/35] UI: improve the info message about --tensorcores --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index eac0cba6..d5116938 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -103,7 +103,7 @@ def create_ui(): shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') with gr.Column(): - shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.') + shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled without GGML_CUDA_FORCE_MMQ. 
This may improve performance on newer cards.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') From 0e94d7075e5404537320b1a17c0a37bf2164a187 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:12:30 -0800 Subject: [PATCH 34/35] UI: minor style fix on Windows --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 6febb471..9d99a876 100644 --- a/css/main.css +++ b/css/main.css @@ -990,7 +990,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #rename-row { width: 100%; justify-content: center; - gap: 10px; + gap: 9px; } From f3c0f964a2ef51c0828495b88c654a0c0ab32b42 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:18:23 -0800 Subject: [PATCH 35/35] Lint --- modules/callbacks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/callbacks.py b/modules/callbacks.py index 2c04cc53..0f918f3d 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -1,11 +1,9 @@ -import gc import traceback from queue import Queue from threading import Thread import torch import transformers -from transformers import is_torch_npu_available, is_torch_xpu_available import modules.shared as shared
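
Usage note for patch 31 (a sketch of assumed local usage, not part of the patch series itself): the installer now folds the old CUDA 11.8 prompt into the GPU menu, so the letters map to A = NVIDIA (CUDA 12.1), B = NVIDIA (CUDA 11.8), C = AMD, D = Apple M Series, E = Intel Arc, N = CPU. Because one_click.py converts GPU_CHOICE=A plus a truthy USE_CUDA118 into choice B, the two invocations below should resolve to the same CUDA 11.8 install when run non-interactively, assuming the same start_linux.sh entry point used in the Dockerfiles above:

    # old two-variable form (still honored when GPU_CHOICE is supplied via the environment)
    GPU_CHOICE=A USE_CUDA118=TRUE ./start_linux.sh --verbose

    # equivalent single-variable form under the new menu (B = NVIDIA, CUDA 11.8)
    GPU_CHOICE=B ./start_linux.sh --verbose

The second form prints a warning that the meaning of "B" has changed, but otherwise proceeds with the same selection.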