From 9424ba17c8e5b8dcd5d89b5cd1735e5d210bea4e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 22 Apr 2025 19:56:42 -0700 Subject: [PATCH 01/49] UI: show only part 00001 of multipart GGUF models in the model menu --- modules/utils.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/modules/utils.py b/modules/utils.py index f6be7541..269561aa 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -76,44 +76,54 @@ def get_available_models(): # Get all GGUF files gguf_files = get_available_ggufs() + # Filter out non-first parts of multipart GGUF files + filtered_gguf_files = [] + for gguf_path in gguf_files: + filename = os.path.basename(gguf_path) + + match = re.search(r'-(\d+)-of-\d+\.gguf$', filename) + + if match: + part_number = match.group(1) + # Keep only if it's part 1 + if part_number.lstrip("0") == "1": + filtered_gguf_files.append(gguf_path) + else: + # Not a multi-part file + filtered_gguf_files.append(gguf_path) + model_dir = Path(shared.args.model_dir) # Find top-level directories containing GGUF files dirs_with_gguf = set() for gguf_path in gguf_files: path = Path(gguf_path) - if path.parts: # If in a subdirectory - dirs_with_gguf.add(path.parts[0]) # Add top-level directory + if path.parts: + dirs_with_gguf.add(path.parts[0]) - # Find directories with safetensors files directly under them + # Find directories with safetensors files dirs_with_safetensors = set() for item in os.listdir(model_dir): item_path = model_dir / item if item_path.is_dir(): - # Check if there are safetensors files directly under this directory if any(file.lower().endswith(('.safetensors', '.pt')) for file in os.listdir(item_path) if (item_path / file).is_file()): dirs_with_safetensors.add(item) # Find valid model directories model_dirs = [] - for item in os.listdir(model_dir): item_path = model_dir / item - - # Skip if not a directory if not item_path.is_dir(): continue - # Include directory if it either: - # 1. Doesn't contain GGUF files, OR - # 2. 
Contains both GGUF and safetensors files + # Include directory if it either doesn't contain GGUF files + # or contains both GGUF and safetensors files if item not in dirs_with_gguf or item in dirs_with_safetensors: model_dirs.append(item) model_dirs = sorted(model_dirs, key=natural_keys) - # Combine all models - return ['None'] + gguf_files + model_dirs + return ['None'] + filtered_gguf_files + model_dirs def get_available_ggufs(): From e99c20bcb036c47255d652026bb28f877b29651b Mon Sep 17 00:00:00 2001 From: oobabooga Date: Wed, 23 Apr 2025 20:10:16 -0300 Subject: [PATCH 02/49] llama.cpp: Add speculative decoding (#6891) --- css/main.css | 8 ++++++++ modules/llama_cpp_server.py | 20 ++++++++++++++++++++ modules/loaders.py | 6 ++++++ modules/shared.py | 9 +++++++++ modules/training.py | 4 ++-- modules/ui.py | 5 +++++ modules/ui_model_menu.py | 11 +++++++++++ 7 files changed, 61 insertions(+), 2 deletions(-) diff --git a/css/main.css b/css/main.css index a3fa9753..1545a74b 100644 --- a/css/main.css +++ b/css/main.css @@ -1291,3 +1291,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .dark .footer-button:hover svg { stroke: rgb(209 213 219); } + +.tgw-accordion { + padding: 10px 12px !important; +} + +.dark .tgw-accordion { + border: 1px solid var(--border-color-dark); +} diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index c88f945d..ecc543f3 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -6,6 +6,7 @@ import subprocess import sys import threading import time +from pathlib import Path import llama_cpp_binaries import requests @@ -281,6 +282,25 @@ class LlamaServer: cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] if shared.args.rope_freq_base > 0: cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)] + if shared.args.model_draft not in [None, 'None']: + path = Path(shared.args.model_draft) + if not path.exists(): + path = Path(f'{shared.args.model_dir}/{shared.args.model_draft}') + + if path.is_file(): + model_file = path + else: + model_file = sorted(Path(f'{shared.args.model_dir}/{shared.args.model_draft}').glob('*.gguf'))[0] + + cmd += ["--model-draft", model_file] + if shared.args.draft_max > 0: + cmd += ["--draft-max", str(shared.args.draft_max)] + if shared.args.gpu_layers_draft > 0: + cmd += ["--gpu-layers-draft", str(shared.args.gpu_layers_draft)] + if shared.args.device_draft: + cmd += ["--device-draft", shared.args.device_draft] + if shared.args.ctx_size_draft > 0: + cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)] env = os.environ.copy() if os.name == 'posix': diff --git a/modules/loaders.py b/modules/loaders.py index 7d6afe80..167b2c98 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -20,6 +20,12 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'mlock', 'numa', + 'model_draft', + 'draft_max', + 'gpu_layers_draft', + 'device_draft', + 'ctx_size_draft', + 'speculative_decoding_accordion' ], 'Transformers': [ 'gpu_split', diff --git a/modules/shared.py b/modules/shared.py index 08268ae0..e531cd3c 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -13,6 +13,7 @@ from modules.logging_colors import logger model = None tokenizer = None model_name = 'None' +draft_model_name = 'None' is_seq2seq = False model_dirty_from_training = False lora_names = [] @@ -127,6 +128,14 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') +# Speculative decoding +group = parser.add_argument_group('Speculative decoding') +group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.') +group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.') +group.add_argument('--gpu-layers-draft', type=int, default=0, help='Number of layers to offload to the GPU for the draft model.') +group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model.') +group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.') + # ExLlamaV2 group = parser.add_argument_group('ExLlamaV2') group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.') diff --git a/modules/training.py b/modules/training.py index c6c380a3..69142463 100644 --- a/modules/training.py +++ b/modules/training.py @@ -52,7 +52,7 @@ def create_ui(): with gr.Column(): always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background']) - with gr.Accordion(label='Target Modules', open=False): + with gr.Accordion(label='Target Modules', open=False, elem_classes='tgw-accordion'): gr.Markdown("Selects which modules to target in training. Targeting more modules is closer to a full fine-tune at the cost of increased VRAM requirements and adapter size.\nNOTE: Only works for model_id='llama', other types will retain default training behavior and not use these settings.") with gr.Row(): with gr.Column(): @@ -86,7 +86,7 @@ def create_ui(): with gr.Row(): lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. "Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.', elem_classes=['slim-dropdown']) - with gr.Accordion(label='Advanced Options', open=False): + with gr.Accordion(label='Advanced Options', open=False, elem_classes='tgw-accordion'): with gr.Row(): with gr.Column(): lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. 
Most users should leave at default.') diff --git a/modules/ui.py b/modules/ui.py index d5caaeaa..6fc5e955 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -145,6 +145,11 @@ def list_model_elements(): 'cpp_runner', 'trust_remote_code', 'no_use_fast', + 'model_draft', + 'draft_max', + 'gpu_layers_draft', + 'device_draft', + 'ctx_size_draft', ] return elements diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index b4af771c..1b0c25fa 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -92,6 +92,17 @@ def create_ui(): shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') + # Speculative decoding + with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']: + with gr.Row(): + shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.draft_model_name, elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) + + shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') + shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.') + shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model.') + shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.') + with gr.Column(): with gr.Row(): shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) From bfbde73409b185f5743b173906cf69edaf4447d0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Apr 2025 07:08:49 -0700 Subject: [PATCH 03/49] Make 'instruct' the default chat mode --- modules/shared.py | 2 +- modules/ui_chat.py | 2 +- settings-template.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index e531cd3c..356a2bb9 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -32,7 +32,7 @@ need_restart = False settings = { 'show_controls': True, 'start_with': '', - 'mode': 'chat-instruct', + 'mode': 'instruct', 'chat_style': 'cai-chat', 'chat-instruct_command': 'Continue the chat dialogue below. 
Write a single reply for the character "<|character|>".\n\n<|prompt|>', 'prompt-default': 'QA', diff --git a/modules/ui_chat.py b/modules/ui_chat.py index a830abfb..b823b8e5 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -88,7 +88,7 @@ def create_ui(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') + shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') with gr.Row(): shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') diff --git a/settings-template.yaml b/settings-template.yaml index 0343df0a..94a5c034 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -1,6 +1,6 @@ show_controls: true start_with: '' -mode: chat-instruct +mode: instruct chat_style: cai-chat chat-instruct_command: |- Continue the chat dialogue below. Write a single reply for the character "<|character|>". From c71a2af5ab3a558ca554d497219386e2d76bedfd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Apr 2025 08:19:46 -0700 Subject: [PATCH 04/49] Handle CMD_FLAGS.txt in the main code (closes #6896) --- modules/shared.py | 16 ++++++++++++++++ one_click.py | 10 ++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 356a2bb9..79925909 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -1,6 +1,7 @@ import argparse import copy import os +import shlex import sys from collections import OrderedDict from pathlib import Path @@ -201,6 +202,21 @@ group.add_argument('--nowebui', action='store_true', help='Do not launch the Gra # Deprecated parameters group = parser.add_argument_group('Deprecated') +# Handle CMD_FLAGS.txt +cmd_flags_path = Path(__file__).parent.parent / "CMD_FLAGS.txt" +if cmd_flags_path.exists(): + with cmd_flags_path.open('r', encoding='utf-8') as f: + cmd_flags = ' '.join( + line.strip().rstrip('\\').strip() + for line in f + if line.strip().rstrip('\\').strip() and not line.strip().startswith('#') + ) + + if cmd_flags: + # Command-line takes precedence over CMD_FLAGS.txt + sys.argv = [sys.argv[0]] + shlex.split(cmd_flags) + sys.argv[1:] + + args = parser.parse_args() args_defaults = parser.parse_args([]) provided_arguments = [] diff --git a/one_click.py b/one_click.py index 04b729eb..5e3d691b 100644 --- a/one_click.py +++ b/one_click.py @@ -28,14 +28,7 @@ conda_env_path = os.path.join(script_dir, "installer_files", "env") state_file = '.installer_state.json' # Command-line flags -cmd_flags_path = os.path.join(script_dir, "CMD_FLAGS.txt") -if os.path.exists(cmd_flags_path): - with open(cmd_flags_path, 'r') as f: - CMD_FLAGS = ' 
'.join(line.strip().rstrip('\\').strip() for line in f if line.strip().rstrip('\\').strip() and not line.strip().startswith('#')) -else: - CMD_FLAGS = '' - -flags = f"{' '.join([flag for flag in sys.argv[1:] if flag != '--update-wizard'])} {CMD_FLAGS}" +flags = f"{' '.join([flag for flag in sys.argv[1:] if flag != '--update-wizard'])}" def signal_handler(sig, frame): @@ -300,6 +293,7 @@ def install_webui(): # Write a flag to CMD_FLAGS.txt for CPU mode if selected_gpu == "NONE": + cmd_flags_path = os.path.join(script_dir, "CMD_FLAGS.txt") with open(cmd_flags_path, 'r+') as cmd_flags_file: if "--cpu" not in cmd_flags_file.read(): print_big_message("Adding the --cpu flag to CMD_FLAGS.txt.") From b313adf653059b1e3bc58e1755f7dc99f9584859 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Apr 2025 08:26:12 -0700 Subject: [PATCH 05/49] Bump llama.cpp, make the wheels work with any Python >= 3.7 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 2 +- requirements/portable/requirements_amd_noavx2.txt | 2 +- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 36 insertions(+), 36 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index b9afaa07..74b1089a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 96cb299d..e75fcfd4 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0f1a4fc2..a57fac6e 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 8d1e5294..2c5b5780 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a44ff3cb..01f45dcb 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 35855162..07845af2 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 0716455e..ffa2b6eb 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 98c43b88..2152ff3e 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt 
b/requirements/portable/requirements.txt index c3336fc7..fcd221e1 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 4855225f..3564b3bf 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt index f40daa8a..98afdc67 100644 --- a/requirements/portable/requirements_amd_noavx2.txt +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 1ede251e..433c1f17 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_x86_64.whl; 
platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 26b68bff..9d9c6852 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 456a0499..dd25a9d5 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 7cd2dd34..dd7f740c 100644 --- 
a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index b47b8bbc..7542e897 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 15834f89..4810bd50 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkan-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkan-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index afb9e90f..d1787e68 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ 
b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkanavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkanavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From 60ac495d59c42d7d3d798910f00f7b922315b09f Mon Sep 17 00:00:00 2001 From: Ziya <32464406+ZiyaCu@users.noreply.github.com> Date: Thu, 24 Apr 2025 18:42:05 +0300 Subject: [PATCH 06/49] extensions/superboogav2: existing embedding check bug fix (#6898) --- extensions/superboogav2/chromadb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index c9e450e4..6e93dd92 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -148,7 +148,7 @@ class ChromaCollector(): id_ = new_ids[i] metadata = metadatas[i] if metadatas is not None else None embedding = self.embeddings_cache.get(text) - if embedding is not None and embedding.any(): + if embedding is not None and any(embedding): existing_texts.append(text) existing_embeddings.append(embedding) existing_ids.append(id_) From f1b64df8ddd79833e685b22ed7447da86b5d7e46 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Apr 2025 09:03:49 -0700 Subject: [PATCH 07/49] EXL2: add another torch.cuda.synchronize() call to prevent errors --- modules/text_generation.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/text_generation.py b/modules/text_generation.py index 40046eb2..4e3d1d7a 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -264,6 +264,11 @@ def apply_stopping_strings(reply, all_stop_strings): def get_reply_from_output_ids(output_ids, state=None, starting_from=0): + import torch + + if torch.cuda.is_available(): + torch.cuda.synchronize() + reply = decode(output_ids[starting_from:], state['skip_special_tokens'] if state else True) # Handle tokenizers that do not add the leading space for the first token From 93fd4ad25d4b98439c4ba2abecc7362ed9b8bd27 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Apr 2025 09:20:11 -0700 Subject: [PATCH 08/49] llama.cpp: Document the --device-draft syntax --- modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 79925909..eeaeb689 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -134,7 +134,7 @@ group = parser.add_argument_group('Speculative decoding') group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.') group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.') 
group.add_argument('--gpu-layers-draft', type=int, default=0, help='Number of layers to offload to the GPU for the draft model.') -group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model.') +group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.') # ExLlamaV2 diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 1b0c25fa..56b6903f 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -100,7 +100,7 @@ def create_ui(): shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.') - shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model.') + shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.') with gr.Column(): From 8ebe8689163abe89668bbd7bf8a5b77c210cf6e0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Apr 2025 09:31:15 -0700 Subject: [PATCH 09/49] Fix typos in b313adf653059b1e3bc58e1755f7dc99f9584859 --- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index e75fcfd4..c8e90190 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -30,5 +30,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index a57fac6e..d994f6e2 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -30,5 +30,5 @@ tiktoken # AMD wheels 
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" From 8f2493cc60036648aedc38d9d8721993743f86e7 Mon Sep 17 00:00:00 2001 From: Matthew Jenkins <40323108+Matthew-Jenkins@users.noreply.github.com> Date: Thu, 24 Apr 2025 22:38:57 -0400 Subject: [PATCH 10/49] Prevent llamacpp defaults from locking up consumer hardware (#6870) --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index eeaeb689..98ec50b2 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -120,7 +120,7 @@ group.add_argument('--flash-attn', action='store_true', help='Use flash-attentio group.add_argument('--n_ctx', type=int, default=8192, help='Size of the prompt context.') group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') -group.add_argument('--batch-size', type=int, default=2048, help='Maximum number of prompt tokens to batch together when calling llama_eval.') +group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') From ae1fe8736549b8bea2dee4daebad66f5828860c8 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Fri, 25 Apr 2025 00:11:04 -0300 Subject: [PATCH 11/49] ExLlamaV2: Add speculative decoding (#6899) --- modules/exllamav2.py | 50 +++++++++++++++++++++++++++++++++++++++++++- modules/loaders.py | 5 ++++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 0289bb21..7d79e516 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -85,7 +85,44 @@ class Exllamav2Model: model.load_autosplit(cache) tokenizer = ExLlamaV2Tokenizer(config) - generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer) + + # Initialize draft model for speculative decoding + draft_model = None + draft_cache = None + + if shared.args.model_draft and shared.args.model_draft.lower() not in ["none", ""]: + logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}") + + # Find the draft model path + draft_path = Path(shared.args.model_draft) + if not draft_path.exists(): + draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft) + + draft_config = ExLlamaV2Config() + draft_config.model_dir = str(draft_path) + draft_config.prepare() + draft_config.arch_compat_overrides() 
+ + # Set context size for draft model + if shared.args.ctx_size_draft > 0: + draft_config.max_seq_len = shared.args.ctx_size_draft + else: + draft_config.max_seq_len = config.max_seq_len + + draft_model = ExLlamaV2(draft_config) + draft_cache = cache_type(draft_model, lazy=True) + draft_model.load_autosplit(draft_cache) + + logger.info(f"Draft model loaded successfully with max_draft={shared.args.draft_max}") + + generator = ExLlamaV2StreamingGenerator( + model, + cache, + tokenizer, + draft_model=draft_model, + draft_cache=draft_cache, + num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0 + ) result = self() result.model = model @@ -93,6 +130,8 @@ class Exllamav2Model: result.tokenizer = tokenizer result.generator = generator result.loras = None + result.draft_model = draft_model + result.draft_cache = draft_cache return result, result def encode(self, string, **kwargs): @@ -179,6 +218,10 @@ class Exllamav2Model: else: max_new_tokens = state['max_new_tokens'] + # Reset speculative decoding stats if using a draft model + if hasattr(self, 'draft_model') and self.draft_model is not None: + self.generator.reset_sd_stats() + self.generator.begin_stream(ids, settings, loras=self.loras) decoded_text = '' @@ -190,6 +233,11 @@ class Exllamav2Model: decoded_text += chunk yield decoded_text + # Log speculative decoding stats if using draft model + if hasattr(self, 'draft_model') and self.draft_model is not None: + efficiency, accuracy, total_tokens, total_draft_tokens, accepted_draft_tokens = self.generator.get_sd_stats() + logger.info(f"Speculative decoding: accepted={accepted_draft_tokens}/{total_draft_tokens} tokens") + def generate(self, prompt, state): output = '' for output in self.generate_with_streaming(prompt, state): diff --git a/modules/loaders.py b/modules/loaders.py index 167b2c98..d256e1e7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -25,7 +25,7 @@ loaders_and_params = OrderedDict({ 'gpu_layers_draft', 'device_draft', 'ctx_size_draft', - 'speculative_decoding_accordion' + 'speculative_decoding_accordion', ], 'Transformers': [ 'gpu_split', @@ -82,6 +82,9 @@ loaders_and_params = OrderedDict({ 'no_xformers', 'no_sdpa', 'exllamav2_info', + 'model_draft', + 'ctx_size_draft', + 'speculative_decoding_accordion', ], 'HQQ': [ 'hqq_backend', From a90df27ff59038e5ac55cca5b2b5962e6b79d855 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Apr 2025 20:33:40 -0700 Subject: [PATCH 12/49] UI: Add a greeting when the chat history is empty --- css/main.css | 9 +++++++++ modules/html_generator.py | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 1545a74b..72adc071 100644 --- a/css/main.css +++ b/css/main.css @@ -1299,3 +1299,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .dark .tgw-accordion { border: 1px solid var(--border-color-dark); } + +.welcome-greeting { + text-align: center; + margin-top: 35vh; + font-size: 24px; + opacity: 0.6; + padding-left: 1rem; + padding-right: 1rem; +} diff --git a/modules/html_generator.py b/modules/html_generator.py index 144f2593..5227e87e 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -1,3 +1,4 @@ +import datetime import functools import html import os @@ -389,8 +390,21 @@ def generate_chat_html(history, name1, name2, reset_cache=False): return output +def time_greeting(): + current_hour = datetime.datetime.now().hour + if 5 <= current_hour < 12: + return "Good morning!" 
+ elif 12 <= current_hour < 18: + return "Good afternoon!" + else: + return "Good evening!" + + def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False): - if mode == 'instruct': + if len(history['visible']) == 0: + greeting = f"
<div class=\"welcome-greeting\">{time_greeting()} How can I help you today?</div>
" + result = f'
{greeting}
' + elif mode == 'instruct': result = generate_instruct_html(history) elif style == 'wpp': result = generate_chat_html(history, name1, name2) From 23399aff3cfad7cf8547ddde9bff4205e0193666 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 24 Apr 2025 20:39:00 -0700 Subject: [PATCH 13/49] UI: minor style change --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 72adc071..c1597600 100644 --- a/css/main.css +++ b/css/main.css @@ -1304,7 +1304,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { text-align: center; margin-top: 35vh; font-size: 24px; - opacity: 0.6; + opacity: 0.7; padding-left: 1rem; padding-right: 1rem; } From 5993ebeb1bec1a0e673815b335da3d652ee09c56 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 25 Apr 2025 05:27:59 -0700 Subject: [PATCH 14/49] Bump exllamav2 to 0.2.9 --- requirements/full/requirements.txt | 6 +++--- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 74b1089a..4a5cd36a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_c https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" 
and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index c8e90190..98df8e8a 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -30,5 +30,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index d994f6e2..9307d733 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -30,5 +30,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 2c5b5780..ac9e17ac 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,4 +32,4 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == 
"3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 01f45dcb..ce401ee5 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,4 +33,4 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_c https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 2152ff3e..6fee3a51 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_c https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" 
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From 2c7ff86015023c1ff7e5586ab9475573706e0680 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 25 Apr 2025 05:28:22 -0700 Subject: [PATCH 15/49] Bump exllamav3 to https://github.com/turboderp-org/exllamav3/commit/de83084184c34cae0aaeca5c8b024de7f148830d --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 4a5cd36a..e30e5de7 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index ac9e17ac..414e1885 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -31,5 +31,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and 
python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index ce401ee5..5627b38e 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,5 +32,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 6fee3a51..646aaaaf 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system 
== "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" From b6fffbd2164793b55196e4b3da6aae5668b62d94 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 25 Apr 2025 05:37:44 -0700 Subject: [PATCH 16/49] UI: minor style change --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index c1597600..8ca5b33f 100644 --- a/css/main.css +++ b/css/main.css @@ -1302,7 +1302,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .welcome-greeting { text-align: center; - margin-top: 35vh; + margin-top: 40vh; font-size: 24px; opacity: 0.7; padding-left: 1rem; From 98f4c694b9df06daf99e9cce14a35bf22328e46a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 25 Apr 2025 07:32:51 -0700 Subject: [PATCH 17/49] llama.cpp: Add --extra-flags parameter for passing additional flags to llama-server --- modules/llama_cpp_server.py | 14 ++++++++++++++ modules/loaders.py | 1 + modules/shared.py | 1 + modules/ui.py | 1 + modules/ui_model_menu.py | 1 + 5 files changed, 18 insertions(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index ecc543f3..7199470d 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -301,6 +301,20 @@ class LlamaServer: cmd += ["--device-draft", shared.args.device_draft] if shared.args.ctx_size_draft > 0: cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)] + if shared.args.extra_flags: + # Clean up the input + extra_flags = shared.args.extra_flags.strip() + if extra_flags.startswith('"') and extra_flags.endswith('"'): + extra_flags = extra_flags[1:-1].strip() + elif extra_flags.startswith("'") and extra_flags.endswith("'"): + extra_flags = extra_flags[1:-1].strip() + + for flag_item in extra_flags.split(';'): + if '=' in flag_item: + flag, value = flag_item.split('=', 1) + cmd += [f"--{flag}", value] + else: + cmd.append(f"--{flag_item}") env = os.environ.copy() if os.name == 'posix': diff --git a/modules/loaders.py b/modules/loaders.py index d256e1e7..9442a147 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -12,6 +12,7 @@ loaders_and_params = OrderedDict({ 'n_ctx', 'cache_type', 'tensor_split', + 'extra_flags', 'rope_freq_base', 'compress_pos_emb', 'flash_attn', diff --git a/modules/shared.py b/modules/shared.py index 98ec50b2..b4f87082 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -128,6 +128,7 @@ group.add_argument('--tensor-split', type=str, default=None, help='Split the mod group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') +group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". 
Example: "override-tensor=exps=CPU"') # Speculative decoding group = parser.add_argument_group('Speculative decoding') diff --git a/modules/ui.py b/modules/ui.py index 6fc5e955..19b76cee 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -114,6 +114,7 @@ def list_model_elements(): 'max_seq_len', 'cache_type', 'tensor_split', + 'extra_flags', 'gpu_split', 'alpha_value', 'rope_freq_base', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 56b6903f..f3319cfb 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -56,6 +56,7 @@ def create_ui(): shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') + shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"') shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') From 0dd71e78c965289c5bfccb3cbc36183f04be23c6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 25 Apr 2025 09:07:14 -0700 Subject: [PATCH 18/49] Add `-noavx2` portable builds --- .github/workflows/build-portable-release-cuda.yml | 8 ++++++-- .github/workflows/build-portable-release-vulkan.yml | 8 ++++++-- .github/workflows/build-portable-release.yml | 12 ++++++++---- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml index b9647b16..fcc74408 100644 --- a/.github/workflows/build-portable-release-cuda.yml +++ b/.github/workflows/build-portable-release-cuda.yml @@ -59,7 +59,7 @@ jobs: $matrix = @{ 'os' = @('ubuntu-22.04', 'windows-2019') 'pyver' = @("3.11") - 'avx' = @("AVX2") + 'avx' = @("AVX2", "AVX") 'cuda' = @("11.7", "12.4") } @@ -161,7 +161,11 @@ jobs: # 6. Create ZIP file cd .. 
VERSION_CLEAN="${VERSION#v}" - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip" + if [[ "$AVX_SUPPORT" == "AVX2" ]]; then + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip" + else + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}-noavx2.zip" + fi echo "Creating archive: $ZIP_NAME" if [[ "$RUNNER_OS" == "Windows" ]]; then diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml index 287635a3..d5aa764c 100644 --- a/.github/workflows/build-portable-release-vulkan.yml +++ b/.github/workflows/build-portable-release-vulkan.yml @@ -59,7 +59,7 @@ jobs: $matrix = @{ 'os' = @('ubuntu-22.04', 'windows-2019') 'pyver' = @("3.11") - 'avx' = @("AVX2") + 'avx' = @("AVX2", "AVX") } if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} @@ -146,7 +146,11 @@ jobs: # 6. Create ZIP file cd .. VERSION_CLEAN="${VERSION#v}" - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan.zip" + if [[ "$AVX_SUPPORT" == "AVX2" ]]; then + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan.zip" + else + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan-noavx2.zip" + fi echo "Creating archive: $ZIP_NAME" if [[ "$RUNNER_OS" == "Windows" ]]; then diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml index 6e041966..2424cc44 100644 --- a/.github/workflows/build-portable-release.yml +++ b/.github/workflows/build-portable-release.yml @@ -15,7 +15,7 @@ on: type: string exclude: description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' - default: 'None' + default: 'os:macos-13,avx:AVX;os:macos-14,avx:AVX' required: false type: string workflow_call: @@ -32,7 +32,7 @@ on: type: string exclude: description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' - default: 'None' + default: 'os:macos-13,avx:AVX;os:macos-14,avx:AVX' required: false type: string @@ -59,7 +59,7 @@ jobs: $matrix = @{ 'os' = @('ubuntu-22.04', 'windows-2019', 'macos-13', 'macos-14') 'pyver' = @("3.11") - 'avx' = @("AVX2") + 'avx' = @("AVX2", "AVX") } if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} @@ -171,7 +171,11 @@ jobs: # 5. Create ZIP file cd .. 
VERSION_CLEAN="${VERSION#v}" - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip" + if [[ "$AVX_SUPPORT" == "AVX2" ]]; then + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip" + else + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-noavx2.zip" + fi echo "Creating archive: $ZIP_NAME" if [[ "$RUNNER_OS" == "Windows" ]]; then From d35818f4e1f51b86d07c7b3c92af9c96facb3e8a Mon Sep 17 00:00:00 2001 From: oobabooga Date: Fri, 25 Apr 2025 18:02:02 -0300 Subject: [PATCH 19/49] UI: Add a collapsible thinking block to messages with `` steps (#6902) --- css/main.css | 82 ++++++++++++++++++++++++++++++++-- js/global_scope_js.js | 76 ++++++++++++++++++++++++++++++-- modules/chat.py | 10 +---- modules/html_generator.py | 93 ++++++++++++++++++++++++++++++++++++--- modules/shared.py | 1 - modules/ui.py | 1 - modules/ui_parameters.py | 1 - settings-template.yaml | 1 - 8 files changed, 238 insertions(+), 27 deletions(-) diff --git a/css/main.css b/css/main.css index 8ca5b33f..9dce4d0e 100644 --- a/css/main.css +++ b/css/main.css @@ -625,19 +625,19 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { width: 100%; overflow-y: visible; } - + .message { break-inside: avoid; } - + .gradio-container { overflow: visible; } - + .tab-nav { display: none !important; } - + #chat-tab > :first-child { max-width: unset; } @@ -1308,3 +1308,77 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding-left: 1rem; padding-right: 1rem; } + +/* Thinking blocks styling */ +.thinking-block { + margin-bottom: 12px; + border-radius: 8px; + border: 1px solid rgba(0, 0, 0, 0.1); + background-color: var(--light-theme-gray); + overflow: hidden; +} + +.dark .thinking-block { + background-color: var(--darker-gray); +} + +.thinking-header { + display: flex; + align-items: center; + padding: 10px 16px; + cursor: pointer; + user-select: none; + font-size: 14px; + color: rgba(0, 0, 0, 0.7); + transition: background-color 0.2s; +} + +.thinking-header:hover { + background-color: rgba(0, 0, 0, 0.03); +} + +.thinking-header::-webkit-details-marker { + display: none; +} + +.thinking-icon { + margin-right: 8px; + color: rgba(0, 0, 0, 0.5); +} + +.thinking-title { + font-weight: 500; +} + +.thinking-content { + padding: 12px 16px; + border-top: 1px solid rgba(0, 0, 0, 0.07); + color: rgba(0, 0, 0, 0.7); + font-size: 14px; + line-height: 1.5; + overflow-wrap: break-word; + max-height: 300px; + overflow-y: scroll; + contain: layout; +} + +/* Animation for opening thinking blocks */ +@keyframes fadeIn { + from { opacity: 0; } + to { opacity: 1; } +} + +.thinking-block[open] .thinking-content { + animation: fadeIn 0.3s ease-out; +} + +/* Additional style for in-progress thinking */ +.thinking-block[data-streaming="true"] .thinking-title { + animation: pulse 1.5s infinite; +} + +@keyframes pulse { + 0% { opacity: 0.6; } + 50% { opacity: 1; } + 100% { opacity: 0.6; } +} diff --git a/js/global_scope_js.js b/js/global_scope_js.js index f308edb9..e808c473 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -31,24 +31,94 @@ function removeLastClick() { } function handleMorphdomUpdate(text) { + // Track closed blocks + const closedBlocks = new Set(); + document.querySelectorAll(".thinking-block").forEach(block => { + const blockId = block.getAttribute("data-block-id"); + // If block exists and is not open, add to closed set + if (blockId && !block.hasAttribute("open")) { + closedBlocks.add(blockId); + } + }); + + // Store scroll positions for any open blocks + const scrollPositions = {}; + 
document.querySelectorAll(".thinking-block[open]").forEach(block => { + const content = block.querySelector(".thinking-content"); + const blockId = block.getAttribute("data-block-id"); + if (content && blockId) { + const isAtBottom = Math.abs((content.scrollHeight - content.scrollTop) - content.clientHeight) < 5; + scrollPositions[blockId] = { + position: content.scrollTop, + isAtBottom: isAtBottom + }; + } + }); + morphdom( document.getElementById("chat").parentNode, "
" + text + "
", { onBeforeElUpdated: function(fromEl, toEl) { + // Preserve code highlighting if (fromEl.tagName === "PRE" && fromEl.querySelector("code[data-highlighted]")) { const fromCode = fromEl.querySelector("code"); const toCode = toEl.querySelector("code"); if (fromCode && toCode && fromCode.textContent === toCode.textContent) { - // If the content is the same, preserve the entire
<pre> element
             toEl.className = fromEl.className;
             toEl.innerHTML = fromEl.innerHTML;
-            return false; // Skip updating the <pre> element
+            return false;
+          }
+        }
+
+        // For thinking blocks, respect closed state
+        if (fromEl.classList && fromEl.classList.contains("thinking-block") &&
+            toEl.classList && toEl.classList.contains("thinking-block")) {
+          const blockId = toEl.getAttribute("data-block-id");
+          // If this block was closed by user, keep it closed
+          if (blockId && closedBlocks.has(blockId)) {
+            toEl.removeAttribute("open");
+          }
+        }
+
+        return !fromEl.isEqualNode(toEl);
+      },
+
+      onElUpdated: function(el) {
+        // Restore scroll positions for open thinking blocks
+        if (el.classList && el.classList.contains("thinking-block") && el.hasAttribute("open")) {
+          const blockId = el.getAttribute("data-block-id");
+          const content = el.querySelector(".thinking-content");
+
+          if (content && blockId && scrollPositions[blockId]) {
+            setTimeout(() => {
+              if (scrollPositions[blockId].isAtBottom) {
+                content.scrollTop = content.scrollHeight;
+              } else {
+                content.scrollTop = scrollPositions[blockId].position;
+              }
+            }, 0);
           }
         }
-        return !fromEl.isEqualNode(toEl); // Update only if nodes differ
       }
     }
   );
+
+  // Add toggle listeners for new blocks
+  document.querySelectorAll(".thinking-block").forEach(block => {
+    if (!block._hasToggleListener) {
+      block.addEventListener("toggle", function(e) {
+        if (this.open) {
+          const content = this.querySelector(".thinking-content");
+          if (content) {
+            setTimeout(() => {
+              content.scrollTop = content.scrollHeight;
+            }, 0);
+          }
+        }
+      });
+      block._hasToggleListener = true;
+    }
+  });
 }
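
For readers following the diff above: the bookkeeping in handleMorphdomUpdate depends on two small invariants, namely that a thinking block keeps the same data-block-id across streaming re-renders (the Python side derives it from the message index, see the html_generator.py hunk below) and that a block counts as scrolled to the bottom when it sits within 5 pixels of the end. A minimal Python sketch of those two checks, offered only as a reading aid and not part of the patch:

# Reading aid only, not part of the patch: mirrors the invariants assumed
# by handleMorphdomUpdate() above.

def thinking_block_id(message_index: int) -> str:
    # Stable across re-renders of the same message, so the closedBlocks and
    # scrollPositions lookups keep matching while the reply streams in.
    return f"thinking-{message_index}-0"

def is_at_bottom(scroll_height: float, scroll_top: float, client_height: float) -> bool:
    # Same 5px tolerance as the isAtBottom check in the JavaScript above.
    return abs((scroll_height - scroll_top) - client_height) < 5

assert thinking_block_id(3) == "thinking-3-0"       # first render of message 3
assert thinking_block_id(3) == "thinking-3-0"       # later re-render: same id
assert is_at_bottom(scroll_height=1000, scroll_top=702, client_height=300)
assert not is_at_bottom(scroll_height=1000, scroll_top=100, client_height=300)
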
diff --git a/modules/chat.py b/modules/chat.py
index fd949907..94d90bdc 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -417,16 +417,8 @@ def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_
             yield history
             return
 
-    show_after = html.escape(state.get("show_after")) if state.get("show_after") else None
     for history in chatbot_wrapper(text, state, regenerate=regenerate, _continue=_continue, loading_message=loading_message, for_ui=for_ui):
-        if show_after:
-            after = history["visible"][-1][1].partition(show_after)[2] or "*Is thinking...*"
-            yield {
-                'internal': history['internal'],
-                'visible': history['visible'][:-1] + [[history['visible'][-1][0], after]]
-            }
-        else:
-            yield history
+        yield history
 
 
 def character_is_loaded(state, raise_exception=False):
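
The chat.py hunk above drops the old show_after post-processing: previously the visible reply was cut at a user-supplied marker while streaming, whereas now the full reply is yielded unchanged and any <think>...</think> section is rendered as a collapsible block by html_generator.py (next hunk). A small hedged sketch of the behavioral difference, using a hypothetical history entry and "</think>" as an example marker rather than real webui state:

# Hypothetical history entry, for illustration only.
reply = "<think>outline the answer</think>Hello!"
history = {"internal": [["hi", reply]], "visible": [["hi", reply]]}

# Old behavior (removed above): hide everything before the marker while streaming.
show_after = "</think>"
after = history["visible"][-1][1].partition(show_after)[2] or "*Is thinking...*"
assert after == "Hello!"

# New behavior: generate_chat_reply yields the history as-is, and the
# <think> section is split out later, at render time (see extract_thinking_block).
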
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 5227e87e..a72e4859 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -107,8 +107,87 @@ def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
+def extract_thinking_block(string):
+    """Extract thinking blocks from the beginning of a string."""
+    if not string:
+        return None, string
+
+    THINK_START_TAG = "<think>"
+    THINK_END_TAG = "</think>"
+
+    # Look for opening tag
+    start_pos = string.lstrip().find(THINK_START_TAG)
+    if start_pos == -1:
+        return None, string
+
+    # Adjust start position to account for any leading whitespace
+    start_pos = string.find(THINK_START_TAG)
+
+    # Find the content after the opening tag
+    content_start = start_pos + len(THINK_START_TAG)
+
+    # Look for closing tag
+    end_pos = string.find(THINK_END_TAG, content_start)
+
+    if end_pos != -1:
+        # Both tags found - extract content between them
+        thinking_content = string[content_start:end_pos]
+        remaining_content = string[end_pos + len(THINK_END_TAG):]
+        return thinking_content, remaining_content
+    else:
+        # Only opening tag found - everything else is thinking content
+        thinking_content = string[content_start:]
+        return thinking_content, ""
+
+
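
Since extract_thinking_block carries the core of the new behavior, a short usage sketch (assumed inputs, not taken from the patch) may help; the expected results follow directly from the code above:

# Fully closed block: content between the tags is the thinking part,
# everything after </think> is the visible reply.
thinking, rest = extract_thinking_block("<think>outline the reply</think>Final answer.")
assert (thinking, rest) == ("outline the reply", "Final answer.")

# Still streaming (no closing tag yet): everything after <think> is thinking,
# and the visible part is empty, which convert_to_markdown below uses to label
# the block "Thinking..." instead of "Thought".
thinking, rest = extract_thinking_block("<think>still reasoning")
assert (thinking, rest) == ("still reasoning", "")

# No <think> tag at all: the string passes through untouched.
assert extract_thinking_block("plain reply") == (None, "plain reply")
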
 @functools.lru_cache(maxsize=None)
-def convert_to_markdown(string):
+def convert_to_markdown(string, message_id=None):
+    if not string:
+        return ""
+
+    # Use a default message ID if none provided
+    if message_id is None:
+        message_id = "unknown"
+
+    # Extract thinking block if present
+    thinking_content, remaining_content = extract_thinking_block(string)
+
+    # Process the main content
+    html_output = process_markdown_content(remaining_content)
+
+    # If thinking content was found, process it using the same function
+    if thinking_content is not None:
+        thinking_html = process_markdown_content(thinking_content)
+
+        # Generate unique ID for the thinking block
+        block_id = f"thinking-{message_id}-0"
+
+        # Check if thinking is complete or still in progress
+        is_streaming = not remaining_content
+        title_text = "Thinking..." if is_streaming else "Thought"
+
+        thinking_block = f'''
+        <details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}" open>
+            <summary class="thinking-header">
+                <svg class="thinking-icon" ...></svg>
+                <span class="thinking-title">{title_text}</span>
+            </summary>
+            <div class="thinking-content">{thinking_html}</div>
+        </details>
+ ''' + + # Prepend the thinking block to the message HTML + html_output = thinking_block + html_output + + return html_output + + +def process_markdown_content(string): + """Process a string through the markdown conversion pipeline.""" if not string: return "" @@ -209,15 +288,15 @@ def convert_to_markdown(string): return html_output -def convert_to_markdown_wrapped(string, use_cache=True): +def convert_to_markdown_wrapped(string, message_id=None, use_cache=True): ''' Used to avoid caching convert_to_markdown calls during streaming. ''' if use_cache: - return convert_to_markdown(string) + return convert_to_markdown(string, message_id=message_id) - return convert_to_markdown.__wrapped__(string) + return convert_to_markdown.__wrapped__(string, message_id=message_id) def generate_basic_html(string): @@ -273,7 +352,7 @@ def generate_instruct_html(history): for i in range(len(history['visible'])): row_visible = history['visible'][i] row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] if converted_visible[0]: # Don't display empty user messages output += ( @@ -320,7 +399,7 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= for i in range(len(history['visible'])): row_visible = history['visible'][i] row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] if converted_visible[0]: # Don't display empty user messages output += ( @@ -360,7 +439,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): for i in range(len(history['visible'])): row_visible = history['visible'][i] row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] if converted_visible[0]: # Don't display empty user messages output += ( diff --git a/modules/shared.py b/modules/shared.py index b4f87082..5177ac67 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -59,7 +59,6 @@ settings = { 'seed': -1, 'custom_stopping_strings': '', 'custom_token_bans': '', - 'show_after': '', 'negative_prompt': '', 'autoload_model': False, 'dark_theme': True, diff --git a/modules/ui.py b/modules/ui.py index 19b76cee..3e1bf6d8 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -207,7 +207,6 @@ def list_interface_input_elements(): 'sampler_priority', 'custom_stopping_strings', 'custom_token_bans', - 'show_after', 'negative_prompt', 'dry_sequence_breakers', 'grammar_string', diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index c3245a9d..b494a758 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -93,7 +93,6 @@ def create_ui(default_preset): shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.', elem_classes=['add_scrollbar']) shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=2, 
value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"') shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Token bans', info='Token IDs to ban, separated by commas. The IDs can be found in the Default or Notebook tab.') - shared.gradio['show_after'] = gr.Textbox(value=shared.settings['show_after'] or None, label='Show after', info='Hide the reply before this text.', placeholder="") shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', info='For CFG. Only used when guidance_scale is different than 1.', lines=3, elem_classes=['add_scrollbar']) shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=generate_params['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.') with gr.Row() as shared.gradio['grammar_file_row']: diff --git a/settings-template.yaml b/settings-template.yaml index 94a5c034..83764f97 100644 --- a/settings-template.yaml +++ b/settings-template.yaml @@ -29,7 +29,6 @@ truncation_length: 8192 seed: -1 custom_stopping_strings: '' custom_token_bans: '' -show_after: '' negative_prompt: '' autoload_model: false dark_theme: true From 877cf44c08dc98066118d1472844808b528fb778 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 25 Apr 2025 16:21:35 -0700 Subject: [PATCH 20/49] llama.cpp: Add StreamingLLM (`--streaming-llm`) --- modules/llama_cpp_server.py | 2 ++ modules/shared.py | 1 + modules/ui_model_menu.py | 1 + 3 files changed, 4 insertions(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 7199470d..7e5e3a4b 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -301,6 +301,8 @@ class LlamaServer: cmd += ["--device-draft", shared.args.device_draft] if shared.args.ctx_size_draft > 0: cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)] + if shared.args.streaming_llm: + cmd += ["--cache-reuse", "1"] if shared.args.extra_flags: # Clean up the input extra_flags = shared.args.extra_flags.strip() diff --git a/modules/shared.py b/modules/shared.py index 5177ac67..c40f8e21 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -128,6 +128,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". 
Example: "override-tensor=exps=CPU"') +group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') # Speculative decoding group = parser.add_argument_group('Speculative decoding') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index f3319cfb..1460dfec 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -71,6 +71,7 @@ def create_ui(): shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') + shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') From faababc4ea5e4548bebc13b50509587343b4c2db Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 25 Apr 2025 16:42:30 -0700 Subject: [PATCH 21/49] llama.cpp: Add a prompt processing progress bar --- modules/llama_cpp_server.py | 45 ++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 7e5e3a4b..85743705 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -1,6 +1,7 @@ import json import os import pprint +import re import socket import subprocess import sys @@ -10,6 +11,7 @@ from pathlib import Path import llama_cpp_binaries import requests +from tqdm import tqdm from modules import shared from modules.logging_colors import logger @@ -335,17 +337,7 @@ class LlamaServer: env=env ) - def filter_stderr(process_stderr): - try: - for line in iter(process_stderr.readline, ''): - if not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line: - sys.stderr.write(line) - sys.stderr.flush() - except (ValueError, IOError): - # Handle pipe closed exceptions - pass - - threading.Thread(target=filter_stderr, args=(self.process.stderr,), daemon=True).start() + threading.Thread(target=filter_stderr_with_progress, args=(self.process.stderr,), daemon=True).start() # Wait for server to be healthy health_url = f"http://127.0.0.1:{self.port}/health" @@ -396,3 +388,34 @@ class LlamaServer: self.process.kill() self.process = None + + +def filter_stderr_with_progress(process_stderr): + progress_bar = None + progress_pattern = re.compile(r'slot update_slots: id.*progress = (\d+\.\d+)') + + try: + for line in iter(process_stderr.readline, ''): + progress_match = progress_pattern.search(line) + + if progress_match: + progress = float(progress_match.group(1)) + + # Create progress bar on first progress message + if progress_bar is None: + 
progress_bar = tqdm(total=1.0, desc="Prompt Processing", leave=False) + + progress_bar.update(progress - progress_bar.n) + + # Clean up when complete + if progress >= 1.0: + progress_bar.close() + progress_bar = None + + if not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line: + sys.stderr.write(line) + sys.stderr.flush() + except (ValueError, IOError): + if progress_bar: + progress_bar.close() + pass From d4b1e31c493090424e3a38a7b115777dac943eac Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 25 Apr 2025 16:59:03 -0700 Subject: [PATCH 22/49] Use `--ctx-size` to specify the context size for all loaders Old flags are still recognized as alternatives. --- modules/exllamav2.py | 4 ++-- modules/exllamav2_hf.py | 20 ++++++++++---------- modules/exllamav3_hf.py | 2 +- modules/llama_cpp_server.py | 2 +- modules/loaders.py | 10 +++++----- modules/models.py | 6 ++---- modules/models_settings.py | 7 +++---- modules/shared.py | 7 +++++-- modules/tensorrt_llm.py | 6 +++--- modules/ui.py | 3 +-- modules/ui_model_menu.py | 11 ++++------- modules/ui_parameters.py | 6 ++---- 12 files changed, 39 insertions(+), 45 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 7d79e516..952b73b8 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -3,6 +3,7 @@ import traceback from pathlib import Path import torch + from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -15,7 +16,6 @@ from exllamav2 import ( ExLlamaV2Tokenizer ) from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator - from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length @@ -40,7 +40,7 @@ class Exllamav2Model: config.model_dir = str(path_to_model) config.prepare() - config.max_seq_len = shared.args.max_seq_len + config.max_seq_len = shared.args.ctx_size config.scale_pos_emb = shared.args.compress_pos_emb config.scale_alpha_value = shared.args.alpha_value config.no_flash_attn = shared.args.no_flash_attn diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index b159d9ce..d6c3bf6e 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -4,6 +4,15 @@ from pathlib import Path from typing import Any, Dict, Optional, Union import torch +from torch.nn import CrossEntropyLoss +from transformers import ( + GenerationConfig, + GenerationMixin, + PretrainedConfig, + PreTrainedModel +) +from transformers.modeling_outputs import CausalLMOutputWithPast + from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -14,15 +23,6 @@ from exllamav2 import ( ExLlamaV2Cache_TP, ExLlamaV2Config ) -from torch.nn import CrossEntropyLoss -from transformers import ( - GenerationConfig, - GenerationMixin, - PretrainedConfig, - PreTrainedModel -) -from transformers.modeling_outputs import CausalLMOutputWithPast - from modules import shared from modules.logging_colors import logger @@ -192,7 +192,7 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin): config.model_dir = str(pretrained_model_name_or_path) config.prepare() - config.max_seq_len = shared.args.max_seq_len + config.max_seq_len = shared.args.ctx_size config.scale_pos_emb = shared.args.compress_pos_emb config.scale_alpha_value = shared.args.alpha_value config.no_flash_attn = shared.args.no_flash_attn diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 2d9c493a..24ba9e13 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -33,7 +33,7 @@ class 
Exllamav3HF(PreTrainedModel, GenerationMixin): self.ex_model = Model.from_config(config) # Calculate the closest multiple of 256 at or above the chosen value - max_tokens = shared.args.max_seq_len + max_tokens = shared.args.ctx_size if max_tokens % 256 != 0: adjusted_tokens = ((max_tokens // 256) + 1) * 256 logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}") diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 85743705..fb972a32 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -254,7 +254,7 @@ class LlamaServer: cmd = [ self.server_path, "--model", self.model_path, - "--ctx-size", str(shared.args.n_ctx), + "--ctx-size", str(shared.args.ctx_size), "--n-gpu-layers", str(shared.args.n_gpu_layers), "--batch-size", str(shared.args.batch_size), "--port", str(self.port), diff --git a/modules/loaders.py b/modules/loaders.py index 9442a147..d8d62bf9 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -9,7 +9,7 @@ loaders_and_params = OrderedDict({ 'threads', 'threads_batch', 'batch_size', - 'n_ctx', + 'ctx_size', 'cache_type', 'tensor_split', 'extra_flags', @@ -48,14 +48,14 @@ loaders_and_params = OrderedDict({ 'no_use_fast', ], 'ExLlamav3_HF': [ - 'max_seq_len', + 'ctx_size', 'gpu_split', 'cfg_cache', 'trust_remote_code', 'no_use_fast', ], 'ExLlamav2_HF': [ - 'max_seq_len', + 'ctx_size', 'cache_type', 'gpu_split', 'alpha_value', @@ -71,7 +71,7 @@ loaders_and_params = OrderedDict({ 'no_use_fast', ], 'ExLlamav2': [ - 'max_seq_len', + 'ctx_size', 'cache_type', 'gpu_split', 'alpha_value', @@ -93,7 +93,7 @@ loaders_and_params = OrderedDict({ 'no_use_fast', ], 'TensorRT-LLM': [ - 'max_seq_len', + 'ctx_size', 'cpp_runner', 'tensorrt_llm_info', ] diff --git a/modules/models.py b/modules/models.py index 99b068aa..d0b0402a 100644 --- a/modules/models.py +++ b/modules/models.py @@ -52,10 +52,8 @@ def load_model(model_name, loader=None): tokenizer = load_tokenizer(model_name) shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) - if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'): - shared.settings['truncation_length'] = shared.args.max_seq_len - elif loader == 'llama.cpp': - shared.settings['truncation_length'] = shared.args.n_ctx + if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp': + shared.settings['truncation_length'] = shared.args.ctx_size logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") logger.info(f"LOADER: \"{loader}\"") diff --git a/modules/models_settings.py b/modules/models_settings.py index ee2ed71b..d3ecd51f 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -11,8 +11,7 @@ def get_fallback_settings(): return { 'bf16': False, 'use_eager_attention': False, - 'max_seq_len': 2048, - 'n_ctx': 2048, + 'ctx_size': 2048, 'rope_freq_base': 0, 'compress_pos_emb': 1, 'alpha_value': 1, @@ -59,7 +58,7 @@ def get_model_metadata(model): for k in metadata: if k.endswith('context_length'): - model_settings['n_ctx'] = min(metadata[k], 8192) + model_settings['ctx_size'] = min(metadata[k], 8192) model_settings['truncation_length_info'] = metadata[k] elif k.endswith('rope.freq_base'): model_settings['rope_freq_base'] = metadata[k] @@ -97,7 +96,7 @@ def get_model_metadata(model): if k in metadata: model_settings['truncation_length'] = metadata[k] model_settings['truncation_length_info'] = metadata[k] - model_settings['max_seq_len'] = 
min(metadata[k], 8192) + model_settings['ctx_size'] = min(metadata[k], 8192) if 'rope_theta' in metadata: model_settings['rope_freq_base'] = metadata['rope_theta'] diff --git a/modules/shared.py b/modules/shared.py index c40f8e21..572bfc09 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -116,7 +116,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for # llama.cpp group = parser.add_argument_group('llama.cpp') group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.') -group.add_argument('--n_ctx', type=int, default=8192, help='Size of the prompt context.') group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') @@ -130,6 +129,11 @@ group.add_argument('--row-split', action='store_true', help='Split the model by group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"') group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') +# Cache +group = parser.add_argument_group('Context and cache management') +group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, help='Context size in tokens.') +group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') + # Speculative decoding group = parser.add_argument_group('Speculative decoding') group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.') @@ -142,7 +146,6 @@ group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the pr group = parser.add_argument_group('ExLlamaV2') group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.') group.add_argument('--autosplit', action='store_true', help='Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.') -group.add_argument('--max_seq_len', type=int, default=8192, help='Maximum sequence length.') group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. 
Necessary to use CFG with that loader.') group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.') diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py index c2685b75..73178c39 100644 --- a/modules/tensorrt_llm.py +++ b/modules/tensorrt_llm.py @@ -1,15 +1,15 @@ from pathlib import Path -import tensorrt_llm import torch -from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp +import tensorrt_llm from modules import shared from modules.logging_colors import logger from modules.text_generation import ( get_max_prompt_length, get_reply_from_output_ids ) +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp class TensorRTLLMModel: @@ -35,7 +35,7 @@ class TensorRTLLMModel: logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"") runner_kwargs.update( max_batch_size=1, - max_input_len=shared.args.max_seq_len - 512, + max_input_len=shared.args.ctx_size - 512, max_output_len=512, max_beam_width=1, max_attention_window_size=None, diff --git a/modules/ui.py b/modules/ui.py index 3e1bf6d8..68cb76a6 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -110,8 +110,7 @@ def list_model_elements(): 'threads_batch', 'batch_size', 'hqq_backend', - 'n_ctx', - 'max_seq_len', + 'ctx_size', 'cache_type', 'tensor_split', 'extra_flags', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 1460dfec..9aeb02d1 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -51,8 +51,7 @@ def create_ui(): shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.') - shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.') + shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') @@ -92,7 +91,7 @@ def create_ui(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. 
To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") - shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') + shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') # Speculative decoding with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']: @@ -247,10 +246,8 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur def update_truncation_length(current_length, state): if 'loader' in state: - if state['loader'].lower().startswith('exllama'): - return state['max_seq_len'] - elif state['loader'] == 'llama.cpp': - return state['n_ctx'] + if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp': + return state['ctx_size'] return current_length diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index b494a758..156e4128 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -121,10 +121,8 @@ def create_event_handlers(): def get_truncation_length(): - if 'max_seq_len' in shared.provided_arguments or shared.args.max_seq_len != shared.args_defaults.max_seq_len: - return shared.args.max_seq_len - elif 'n_ctx' in shared.provided_arguments or shared.args.n_ctx != shared.args_defaults.n_ctx: - return shared.args.n_ctx + if 'ctx_size' in shared.provided_arguments or shared.args.ctx_size != shared.args_defaults.ctx_size: + return shared.args.ctx_size else: return shared.settings['truncation_length'] From d4017fbb6d0b9be7a8964ad3fa03db0b373e453d Mon Sep 17 00:00:00 2001 From: oobabooga Date: Fri, 25 Apr 2025 21:32:00 -0300 Subject: [PATCH 23/49] ExLlamaV3: Add kv cache quantization (#6903) --- modules/exllamav3_hf.py | 29 ++++++++++++++++++++++++++++- modules/loaders.py | 2 ++ modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 24ba9e13..f15fc0b2 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Union import torch from exllamav3 import Cache, Config, Model +from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from torch.nn import CrossEntropyLoss from transformers import ( GenerationConfig, @@ -39,7 +40,33 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): logger.warning(f"max_num_tokens must be a multiple of 256. 
Adjusting from {max_tokens} to {adjusted_tokens}") max_tokens = adjusted_tokens - self.ex_cache = Cache(self.ex_model, max_num_tokens=max_tokens) + # Parse cache type + cache_type = shared.args.cache_type.lower() + cache_kwargs = {} + if cache_type == 'fp16': + layer_type = CacheLayer_fp16 + elif cache_type.startswith('q'): + layer_type = CacheLayer_quant + if '_' in cache_type: + # Different bits for k and v (e.g., q4_q8) + k_part, v_part = cache_type.split('_') + k_bits = int(k_part[1:]) + v_bits = int(v_part[1:]) + else: + # Same bits for k and v (e.g., q4) + k_bits = v_bits = int(cache_type[1:]) + + # Validate bit ranges + if not (2 <= k_bits <= 8 and 2 <= v_bits <= 8): + logger.warning(f"Invalid quantization bits: k_bits={k_bits}, v_bits={v_bits}. Must be between 2 and 8. Falling back to fp16.") + layer_type = CacheLayer_fp16 + else: + cache_kwargs = {'k_bits': k_bits, 'v_bits': v_bits} + else: + logger.warning(f"Unrecognized cache type: {cache_type}. Falling back to fp16.") + layer_type = CacheLayer_fp16 + + self.ex_cache = Cache(self.ex_model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs) # Create load parameters dictionary load_params = {'progressbar': True} diff --git a/modules/loaders.py b/modules/loaders.py index d8d62bf9..062e4837 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -13,6 +13,7 @@ loaders_and_params = OrderedDict({ 'cache_type', 'tensor_split', 'extra_flags', + 'streaming_llm', 'rope_freq_base', 'compress_pos_emb', 'flash_attn', @@ -49,6 +50,7 @@ loaders_and_params = OrderedDict({ ], 'ExLlamav3_HF': [ 'ctx_size', + 'cache_type', 'gpu_split', 'cfg_cache', 'trust_remote_code', diff --git a/modules/shared.py b/modules/shared.py index 572bfc09..96f65929 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -132,7 +132,7 @@ group.add_argument('--streaming-llm', action='store_true', help='Activate Stream # Cache group = parser.add_argument_group('Context and cache management') group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, help='Context size in tokens.') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') +group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') # Speculative decoding group = parser.add_argument_group('Speculative decoding') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 9aeb02d1..6bd647c6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -52,7 +52,7 @@ def create_ui(): shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. 
Common values: 2048, 4096, 8192, 16384, 32768, 65536.') - shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') + shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"') From d9de14d1f7e81d6e18711431acb9579dbb8a5ddf Mon Sep 17 00:00:00 2001 From: oobabooga Date: Sat, 26 Apr 2025 08:56:54 -0300 Subject: [PATCH 24/49] Restructure the repository (#6904) --- .gitignore | 20 +- README.md | 239 +++++++++--------- download-model.py | 6 +- extensions/Training_PRO/matplotgraph.py | 2 +- extensions/Training_PRO/script.py | 48 ++-- extensions/Training_PRO/train_utils.py | 12 +- extensions/gallery/script.py | 4 +- extensions/openai/typing.py | 6 +- js/main.js | 2 +- js/update_big_picture.js | 2 +- modules/chat.py | 40 +-- modules/evaluate.py | 8 +- modules/html_generator.py | 8 +- modules/models_settings.py | 6 +- modules/presets.py | 2 +- modules/prompts.py | 2 +- modules/shared.py | 8 +- modules/training.py | 34 +-- modules/transformers_loader.py | 2 +- modules/ui.py | 2 +- modules/ui_chat.py | 2 +- modules/ui_default.py | 4 +- modules/ui_file_saving.py | 12 +- modules/ui_model_menu.py | 4 +- modules/ui_parameters.py | 2 +- modules/ui_session.py | 2 +- modules/utils.py | 12 +- one_click.py | 6 +- server.py | 18 +- CMD_FLAGS.txt => user_data/CMD_FLAGS.txt | 0 .../characters}/Assistant.yaml | 0 .../characters}/Example.png | Bin .../characters}/Example.yaml | 0 .../grammars}/arithmetic.gbnf | 0 {grammars => user_data/grammars}/c.gbnf | 0 {grammars => user_data/grammars}/chess.gbnf | 0 {grammars => user_data/grammars}/json.gbnf | 0 .../grammars}/json_w_trailing_space.gbnf | 0 {grammars => user_data/grammars}/list.gbnf | 0 .../grammars}/roleplay.gbnf | 0 .../grammars}/simple_arithmetic.gbnf | 0 .../Airoboros-v1.2.yaml | 0 .../instruction-templates}/Alpaca.yaml | 0 .../instruction-templates}/Bactrian.yaml | 0 .../instruction-templates}/Baichuan Chat.yaml | 0 .../instruction-templates}/Baize.yaml | 0 .../instruction-templates}/Bluemoon.yaml | 0 .../instruction-templates}/ChatGLM.yaml | 0 .../instruction-templates}/ChatML.yaml | 0 .../Chinese-Vicuna-Chat.yaml | 0 .../instruction-templates}/Command-R.yaml | 0 .../Galactica Cite.yaml | 0 .../Galactica Finetuned.yaml | 0 .../instruction-templates}/Galactica Q.yaml | 0 .../Galactica Summary.yaml | 0 .../Galactica Work.yaml | 0 .../instruction-templates}/Galactica v2.yaml | 0 .../instruction-templates}/Galactica.yaml | 0 .../instruction-templates}/Gorilla.yaml | 0 .../Guanaco non-chat.yaml | 0 .../instruction-templates}/Guanaco-QLoRA.yaml | 0 .../H2O-prompt_answer.yaml | 0 
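As an aside to the ExLlamaV3 cache-quantization patch above: the `cache_type` strings it accepts (`fp16`, `q2` through `q8`, and mixed forms such as `q4_q8`) follow a small convention that is easy to restate in isolation. The sketch below is illustrative only: `parse_cache_type` is a hypothetical helper, not something added by the diff; the real logic lives inline in `Exllamav3HF.__init__` in `modules/exllamav3_hf.py`.

```python
# Hypothetical helper restating the cache_type convention from the patch above;
# not part of the diff itself.
def parse_cache_type(cache_type: str):
    """Return ('fp16', None, None) or ('quant', k_bits, v_bits)."""
    cache_type = cache_type.lower()
    if cache_type == 'fp16':
        return ('fp16', None, None)

    if cache_type.startswith('q'):
        try:
            if '_' in cache_type:
                # Separate bits for K and V, e.g. "q4_q8" -> k=4, v=8
                k_part, v_part = cache_type.split('_')
                k_bits, v_bits = int(k_part[1:]), int(v_part[1:])
            else:
                # Same bits for K and V, e.g. "q6" -> k=6, v=6
                k_bits = v_bits = int(cache_type[1:])
        except ValueError:
            return ('fp16', None, None)

        if 2 <= k_bits <= 8 and 2 <= v_bits <= 8:
            return ('quant', k_bits, v_bits)

    # Unrecognized or out-of-range values fall back to fp16, mirroring the
    # logger.warning() fallbacks in Exllamav3HF.
    return ('fp16', None, None)


assert parse_cache_type('q4_q8') == ('quant', 4, 8)
assert parse_cache_type('q6') == ('quant', 6, 6)
assert parse_cache_type('bogus') == ('fp16', None, None)
```

The same strings can be typed directly into the `cache_type` dropdown now that it is created with `allow_custom_value=True`.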
.../instruction-templates}/Hippogriff.yaml | 0 .../instruction-templates}/INCITE-Chat.yaml | 0 .../INCITE-Instruct.yaml | 0 .../instruction-templates}/KoAlpaca.yaml | 0 .../instruction-templates}/Koala.yaml | 0 .../instruction-templates}/LLaVA.yaml | 0 .../instruction-templates}/Llama-v2.yaml | 0 .../instruction-templates}/Llama-v3.yaml | 0 .../instruction-templates}/MOSS.yaml | 0 .../Manticore Chat.yaml | 0 .../instruction-templates}/Metharme.yaml | 0 .../instruction-templates}/Mistral.yaml | 0 .../instruction-templates}/NVIDIA-ChatQA.yaml | 0 .../instruction-templates}/NewHope.yaml | 0 .../Open Assistant.yaml | 0 .../instruction-templates}/OpenBuddy.yaml | 0 .../instruction-templates}/OpenChat.yaml | 0 .../OpenOrca-Platypus2.yaml | 0 .../instruction-templates}/Orca Mini.yaml | 0 .../instruction-templates}/Orca-Vicuna.yaml | 0 .../instruction-templates}/RWKV-Raven.yaml | 0 .../instruction-templates}/RWKV-World.yaml | 0 .../instruction-templates}/Samantha.yaml | 0 .../instruction-templates}/StableBeluga2.yaml | 0 .../instruction-templates}/StableLM.yaml | 0 .../instruction-templates}/StableVicuna.yaml | 0 .../instruction-templates}/Starchat-Beta.yaml | 0 .../instruction-templates}/Synthia-CoT.yaml | 0 .../instruction-templates}/Synthia.yaml | 0 .../instruction-templates}/Tulu.yaml | 0 .../instruction-templates}/Vicuna-v0.yaml | 0 .../instruction-templates}/Vicuna-v1.1.yaml | 0 .../instruction-templates}/Vigogne-Chat.yaml | 0 .../Vigogne-Instruct.yaml | 0 .../Wizard-Mega ShareGPT.yaml | 0 .../instruction-templates}/Wizard-Mega.yaml | 0 .../instruction-templates}/Ziya.yaml | 0 .../loras}/place-your-loras-here.txt | 0 {models => user_data/models}/config.yaml | 0 .../models}/place-your-models-here.txt | 0 .../presets}/Contrastive Search.yaml | 0 {presets => user_data/presets}/Creative.yaml | 0 .../presets}/Deterministic.yaml | 0 {presets => user_data/presets}/Instruct.yaml | 0 .../presets}/Null preset.yaml | 0 {presets => user_data/presets}/min_p.yaml | 0 .../prompts}/Alpaca-with-Input.txt | 0 {prompts => user_data/prompts}/QA.txt | 0 .../datasets/put-trainer-datasets-here.txt | 0 .../training}/formats/ChatML-format.json | 0 .../formats/alpaca-chatbot-format.json | 0 .../training}/formats/alpaca-format.json | 0 .../training}/formats/llama2-chat-format.json | 0 .../training}/formats/vicuna-format.json | 0 116 files changed, 254 insertions(+), 261 deletions(-) rename CMD_FLAGS.txt => user_data/CMD_FLAGS.txt (100%) rename {characters => user_data/characters}/Assistant.yaml (100%) rename {characters => user_data/characters}/Example.png (100%) rename {characters => user_data/characters}/Example.yaml (100%) rename {grammars => user_data/grammars}/arithmetic.gbnf (100%) rename {grammars => user_data/grammars}/c.gbnf (100%) rename {grammars => user_data/grammars}/chess.gbnf (100%) rename {grammars => user_data/grammars}/json.gbnf (100%) rename {grammars => user_data/grammars}/json_w_trailing_space.gbnf (100%) rename {grammars => user_data/grammars}/list.gbnf (100%) rename {grammars => user_data/grammars}/roleplay.gbnf (100%) rename {grammars => user_data/grammars}/simple_arithmetic.gbnf (100%) rename {instruction-templates => user_data/instruction-templates}/Airoboros-v1.2.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Alpaca.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Bactrian.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Baichuan Chat.yaml (100%) rename {instruction-templates => 
user_data/instruction-templates}/Baize.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Bluemoon.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/ChatGLM.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/ChatML.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Chinese-Vicuna-Chat.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Command-R.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Galactica Cite.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Galactica Finetuned.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Galactica Q.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Galactica Summary.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Galactica Work.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Galactica v2.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Galactica.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Gorilla.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Guanaco non-chat.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Guanaco-QLoRA.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/H2O-prompt_answer.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Hippogriff.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/INCITE-Chat.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/INCITE-Instruct.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/KoAlpaca.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Koala.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/LLaVA.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Llama-v2.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Llama-v3.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/MOSS.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Manticore Chat.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Metharme.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Mistral.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/NVIDIA-ChatQA.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/NewHope.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Open Assistant.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/OpenBuddy.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/OpenChat.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/OpenOrca-Platypus2.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Orca Mini.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Orca-Vicuna.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/RWKV-Raven.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/RWKV-World.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Samantha.yaml (100%) 
rename {instruction-templates => user_data/instruction-templates}/StableBeluga2.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/StableLM.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/StableVicuna.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Starchat-Beta.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Synthia-CoT.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Synthia.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Tulu.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Vicuna-v0.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Vicuna-v1.1.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Vigogne-Chat.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Vigogne-Instruct.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Wizard-Mega ShareGPT.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Wizard-Mega.yaml (100%) rename {instruction-templates => user_data/instruction-templates}/Ziya.yaml (100%) rename {loras => user_data/loras}/place-your-loras-here.txt (100%) rename {models => user_data/models}/config.yaml (100%) rename {models => user_data/models}/place-your-models-here.txt (100%) rename {presets => user_data/presets}/Contrastive Search.yaml (100%) rename {presets => user_data/presets}/Creative.yaml (100%) rename {presets => user_data/presets}/Deterministic.yaml (100%) rename {presets => user_data/presets}/Instruct.yaml (100%) rename {presets => user_data/presets}/Null preset.yaml (100%) rename {presets => user_data/presets}/min_p.yaml (100%) rename {prompts => user_data/prompts}/Alpaca-with-Input.txt (100%) rename {prompts => user_data/prompts}/QA.txt (100%) rename {training => user_data/training}/datasets/put-trainer-datasets-here.txt (100%) rename {training => user_data/training}/formats/ChatML-format.json (100%) rename {training => user_data/training}/formats/alpaca-chatbot-format.json (100%) rename {training => user_data/training}/formats/alpaca-format.json (100%) rename {training => user_data/training}/formats/llama2-chat-format.json (100%) rename {training => user_data/training}/formats/vicuna-format.json (100%) diff --git a/.gitignore b/.gitignore index 318e147d..bd69c941 100644 --- a/.gitignore +++ b/.gitignore @@ -1,26 +1,8 @@ -/cache -/characters /css /extensions -/grammars /installer_files -/logs -/loras -/models -/presets -/prompts /repositories -/softprompts -/torch-dumps -/training/datasets - -/CMD_FLAGS.txt -/img_bot* -/img_me* -/models/config-user.yaml -/notification.mp3 -/settings*.json -/settings*.yaml +/user_data .chroma .DS_Store diff --git a/README.md b/README.md index f62e3508..58f77786 100644 --- a/README.md +++ b/README.md @@ -182,131 +182,140 @@ List of command-line flags ```txt -usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--settings SETTINGS] - [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices] [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] - [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] - [--use_flash_attention_2] 
[--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] - [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] - [--numa] [--no-kv-offload] [--row-split] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] - [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] - [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] - [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] - [--subpath SUBPATH] [--old-colors] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] - [--api-disable-ipv4] [--nowebui] +usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS] + [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] + [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] + [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] + [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size CTX_SIZE] + [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] + [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] + [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] + [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] + [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api] + [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text generation web UI options: - -h, --help show this help message and exit + -h, --help show this help message and exit Basic settings: - --multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly. - --character CHARACTER The name of the character to load in chat mode by default. 
- --model MODEL Name of the model to load by default. - --lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. - --model-dir MODEL_DIR Path to directory with all the models. - --lora-dir LORA_DIR Path to directory with all the loras. - --settings SETTINGS Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this - file will be loaded by default without the need to use the --settings flag. - --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. - --verbose Print the prompts to the terminal. - --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. + --multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly. + --character CHARACTER The name of the character to load in chat mode by default. + --model MODEL Name of the model to load by default. + --lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. + --model-dir MODEL_DIR Path to directory with all the models. + --lora-dir LORA_DIR Path to directory with all the loras. + --model-menu Show a model menu in the terminal when the web UI is first launched. + --settings SETTINGS Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, + this file will be loaded by default without the need to use the --settings flag. + --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. + --verbose Print the prompts to the terminal. + --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, - HQQ, TensorRT-LLM. + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, + ExLlamav2, HQQ, TensorRT-LLM. Transformers/Accelerate: - --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. - --auto-devices Automatically split the model across the available GPU(s) and CPU. - --gpu-memory GPU_MEMORY [GPU_MEMORY ...] Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values - in MiB like --gpu-memory 3500MiB. - --cpu-memory CPU_MEMORY Maximum CPU memory in GiB to allocate for offloaded weights. Same as above. - --disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. - --disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "cache". - --load-in-8bit Load the model with 8-bit precision (using bitsandbytes). - --bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. - --no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. 
- --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. - --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. - --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. - --use_flash_attention_2 Set use_flash_attention_2=True while loading the model. - --use_eager_attention Set attn_implementation= eager while loading the model. - --torch-compile Compile the model with torch.compile for improved performance. + --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. + --cpu-memory CPU_MEMORY Maximum CPU memory in GiB. Use this for CPU offloading. + --disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. + --disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "user_data/cache". + --load-in-8bit Load the model with 8-bit precision (using bitsandbytes). + --bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. + --no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. + --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. + --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. + --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. + --use_flash_attention_2 Set use_flash_attention_2=True while loading the model. + --use_eager_attention Set attn_implementation= eager while loading the model. + --torch-compile Compile the model with torch.compile for improved performance. bitsandbytes 4-bit: - --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). - --use_double_quant use_double_quant for 4-bit. - --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. - --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. + --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). + --use_double_quant use_double_quant for 4-bit. + --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. + --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. llama.cpp: - --flash-attn Use flash-attention. - --n_ctx N_CTX Size of the prompt context. - --threads THREADS Number of threads to use. - --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. - --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. - --no-mmap Prevent mmap from being used. - --mlock Force the system to keep the model in RAM. - --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. - --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. - --numa Activate NUMA task allocation for llama.cpp. - --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. - --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. + --flash-attn Use flash-attention. + --threads THREADS Number of threads to use. + --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. 
+ --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. + --no-mmap Prevent mmap from being used. + --mlock Force the system to keep the model in RAM. + --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. + --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. + --numa Activate NUMA task allocation for llama.cpp. + --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. + --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU" + --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. + +Context and cache management: + --ctx-size CTX_SIZE, --n_ctx CTX_SIZE, --max_seq_len CTX_SIZE + Context size in tokens. + +Speculative decoding: + --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. + --draft-max DRAFT_MAX Number of tokens to draft for speculative decoding. + --gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model. + --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 + --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. ExLlamaV2: - --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. - --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. - --max_seq_len MAX_SEQ_LEN Maximum sequence length. - --cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. - --no_flash_attn Force flash-attention to not be used. - --no_xformers Force xformers to not be used. - --no_sdpa Force Torch SDPA to not be used. - --num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral. - --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. + --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. + --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. + --cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. + --no_flash_attn Force flash-attention to not be used. + --no_xformers Force xformers to not be used. + --no_sdpa Force Torch SDPA to not be used. + --num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral. + --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. HQQ: - --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. + --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. TensorRT-LLM: - --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. + --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. 
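Stepping out of the flags listing for a moment: the llama.cpp and speculative-decoding options documented above combine naturally into a single launch command. The snippet below is a hypothetical example expressed as an argument list; the GGUF filenames and numeric values are placeholders, not values from the patch, while the `--ctx-size-draft 0` behavior (reuse the main model's context size) and the `--extra-flags` example string are taken from the help text above. Speculative decoding generally also assumes the draft model shares the main model's vocabulary.

```python
# Hypothetical launch invocation assembled as an argument list; filenames and
# numbers are placeholders.
launch_args = [
    "python", "server.py",
    "--model", "MainModel-Q4_K_M.gguf",           # a GGUF under user_data/models
    "--ctx-size", "32768",
    "--extra-flags", "override-tensor=exps=CPU",  # example string from the help text
    # Speculative decoding (new in this patch series):
    "--model-draft", "DraftModel-Q4_K_M.gguf",    # small companion model
    "--draft-max", "8",                           # tokens drafted per step
    "--gpu-layers-draft", "99",                   # keep the draft model on the GPU
    "--ctx-size-draft", "0",                      # 0 = reuse the main model's ctx size
]

print(" ".join(launch_args))
```

The same values can also be set interactively from the new "Speculative decoding" accordion in the Model tab.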
Cache: - --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. + --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. DeepSpeed: - --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. - --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. - --local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups. + --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. + --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. + --local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups. RoPE: - --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. - --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). - --compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. + --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. + --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). + --compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. Gradio: - --listen Make the web UI reachable from your local network. - --listen-port LISTEN_PORT The listening port that the server will use. - --listen-host LISTEN_HOST The hostname that the server will use. - --share Create a public URL. This is useful for running the web UI on Google Colab or similar. - --auto-launch Open the web UI in the default browser upon launch. - --gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". - --gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. - --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. - --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. - --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy - --old-colors Use the legacy Gradio colors, before the December/2024 update. + --listen Make the web UI reachable from your local network. + --listen-port LISTEN_PORT The listening port that the server will use. + --listen-host LISTEN_HOST The hostname that the server will use. + --share Create a public URL. This is useful for running the web UI on Google Colab or similar. + --auto-launch Open the web UI in the default browser upon launch. + --gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". + --gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. + --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. 
+ --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. + --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy + --old-colors Use the legacy Gradio colors, before the December/2024 update. API: - --api Enable the API extension. - --public-api Create a public URL for the API using Cloudfare. - --public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. - --api-port API_PORT The listening port for the API. - --api-key API_KEY API authentication key. - --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. - --api-enable-ipv6 Enable IPv6 for the API - --api-disable-ipv4 Disable IPv4 for the API - --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. + --api Enable the API extension. + --public-api Create a public URL for the API using Cloudfare. + --public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. + --api-port API_PORT The listening port for the API. + --api-key API_KEY API authentication key. + --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. + --api-enable-ipv6 Enable IPv6 for the API + --api-disable-ipv4 Disable IPv4 for the API + --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. ``` @@ -317,35 +326,37 @@ https://github.com/oobabooga/text-generation-webui/wiki ## Downloading models -Models should be placed in the folder `text-generation-webui/models`. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads). +Models should be placed in the folder `text-generation-webui/user_data/models`. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads). -* GGUF models are a single file and should be placed directly into `models`. Example: +* GGUF models are a single file and should be placed directly into `user_data/models`. Example: ``` text-generation-webui -└── models - └── llama-2-13b-chat.Q4_K_M.gguf +└── user_data + └── models + └── llama-2-13b-chat.Q4_K_M.gguf ``` * The remaining model types (like 16-bit Transformers models and EXL2 models) are made of several files and must be placed in a subfolder. 
Example: ``` text-generation-webui -├── models -│   ├── lmsys_vicuna-33b-v1.3 -│   │   ├── config.json -│   │   ├── generation_config.json -│   │   ├── pytorch_model-00001-of-00007.bin -│   │   ├── pytorch_model-00002-of-00007.bin -│   │   ├── pytorch_model-00003-of-00007.bin -│   │   ├── pytorch_model-00004-of-00007.bin -│   │   ├── pytorch_model-00005-of-00007.bin -│   │   ├── pytorch_model-00006-of-00007.bin -│   │   ├── pytorch_model-00007-of-00007.bin -│   │   ├── pytorch_model.bin.index.json -│   │   ├── special_tokens_map.json -│   │   ├── tokenizer_config.json -│   │   └── tokenizer.model +└── user_data + └── models + └── lmsys_vicuna-33b-v1.3 + ├── config.json + ├── generation_config.json + ├── pytorch_model-00001-of-00007.bin + ├── pytorch_model-00002-of-00007.bin + ├── pytorch_model-00003-of-00007.bin + ├── pytorch_model-00004-of-00007.bin + ├── pytorch_model-00005-of-00007.bin + ├── pytorch_model-00006-of-00007.bin + ├── pytorch_model-00007-of-00007.bin + ├── pytorch_model.bin.index.json + ├── special_tokens_map.json + ├── tokenizer_config.json + └── tokenizer.model ``` In both cases, you can use the "Model" tab of the UI to download the model from Hugging Face automatically. It is also possible to download it via the command-line with: diff --git a/download-model.py b/download-model.py index 8ff1d69c..25517491 100644 --- a/download-model.py +++ b/download-model.py @@ -1,5 +1,5 @@ ''' -Downloads models from Hugging Face to models/username_modelname. +Downloads models from Hugging Face to user_data/models/username_modelname. Example: python download-model.py facebook/opt-1.3b @@ -175,7 +175,7 @@ class ModelDownloader: if model_dir: base_folder = model_dir else: - base_folder = 'models' if not is_lora else 'loras' + base_folder = 'user_data/models' if not is_lora else 'user_data/loras' # If the model is of type GGUF, save directly in the base_folder if is_llamacpp: @@ -356,7 +356,7 @@ if __name__ == '__main__': parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).') parser.add_argument('--exclude-pattern', type=str, default=None, help='Regex pattern to exclude files from download.') parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.') - parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).') + parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/user_data/models).') parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.') parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.') parser.add_argument('--max-retries', type=int, default=7, help='Max retries count when get error in download time.') diff --git a/extensions/Training_PRO/matplotgraph.py b/extensions/Training_PRO/matplotgraph.py index 348fc01a..b30bee83 100644 --- a/extensions/Training_PRO/matplotgraph.py +++ b/extensions/Training_PRO/matplotgraph.py @@ -59,4 +59,4 @@ def create_graph(lora_path, lora_name): print(f"File 'training_graph.json' does not exist in the {lora_path}") except ImportError: - print("matplotlib is not installed. Please install matplotlib to create PNG graphs") \ No newline at end of file + print("matplotlib is not installed. 
Please install matplotlib to create PNG graphs") diff --git a/extensions/Training_PRO/script.py b/extensions/Training_PRO/script.py index f553e482..cb11a8df 100644 --- a/extensions/Training_PRO/script.py +++ b/extensions/Training_PRO/script.py @@ -175,23 +175,23 @@ def ui(): with gr.Row(): with gr.Column(): with gr.Row(): - dataset = gr.Dropdown(choices=get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown']) - create_refresh_button(dataset, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'json')}, 'refresh-button') + dataset = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown']) + create_refresh_button(dataset, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'json')}, 'refresh-button') with gr.Row(): - eval_dataset = gr.Dropdown(choices=get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown']) - create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'json')}, 'refresh-button') + eval_dataset = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown']) + create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'json')}, 'refresh-button') with gr.Column(): with gr.Row(): - format = gr.Dropdown(choices=get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown']) - create_refresh_button(format, lambda: None, lambda: {'choices': get_datasets('training/formats', 'json')}, 'refresh-button') + format = gr.Dropdown(choices=get_datasets('user_data/training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown']) + create_refresh_button(format, lambda: None, lambda: {'choices': get_datasets('user_data/training/formats', 'json')}, 'refresh-button') with gr.Row(): eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') with gr.Tab(label="Text file"): with gr.Row(): - raw_text_file = gr.Dropdown(choices=get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The text file to use for training.', elem_classes=['slim-dropdown']) - create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'txt')}, 'refresh-button') + raw_text_file = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'txt'), value='None', label='Text file', info='The text file to use for training.', elem_classes=['slim-dropdown']) + create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'txt')}, 'refresh-button') with gr.Row(): with gr.Column(): @@ -208,7 +208,7 @@ def ui(): download_file_url = gr.Textbox(label='Download JSON or txt file to datasets (or formats) folder', value='',info='The URL of a file to download. 
If on github, make sure you get url of the raw file (https://raw.githubusercontent.com/...). If huggin face, make sure the url has /resolve/ in it not /blob/') with gr.Row(): download_check_overwrite = gr.Checkbox(label='Overwrite', value=False, info='Overwrite if file exist') - download_folder = gr.Radio(label="Destination", value='training/datasets', choices=['training/datasets', 'training/formats'], interactive=True) + download_folder = gr.Radio(label="Destination", value='user_data/training/datasets', choices=['user_data/training/datasets', 'user_data/training/formats'], interactive=True) download_button = gr.Button('Download') download_status = gr.Textbox(label='Download Status', value='', interactive=False) with gr.Row(): @@ -235,7 +235,7 @@ def ui(): with gr.Row(): with gr.Column(): models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True) - evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.') + evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('user_data/training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under user_data/training/datasets.') with gr.Row(): with gr.Column(): stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') @@ -310,7 +310,7 @@ def ui(): if raw_text_file not in ['None', '']: logger.info("Loading Text file...") - fullpath = clean_path('training/datasets', f'{raw_text_file}') + fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}') fullpath = Path(fullpath) if fullpath.is_dir(): logger.info('Training path directory {}'.format(raw_text_file)) @@ -324,10 +324,10 @@ def ui(): logger.info(f"Loaded training file: {file_path.name}") else: try: - with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: + with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: raw_text = file.read().replace('\r', '') except: - yield f"{raw_text_file}.txt doesn't seem to exsist anymore... check your training/datasets folder" + yield f"{raw_text_file}.txt doesn't seem to exsist anymore... check your user_data/training/datasets folder" return @@ -353,7 +353,7 @@ def ui(): yield "Select format choice for dataset." 
return - with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: + with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: format_data: dict[str, str] = json.load(formatFile) def generate_prompt(data_point: dict[str, str]): @@ -381,7 +381,7 @@ def ui(): return tokenize_dummy(prompt) logger.info("Loading JSON datasets...") - data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json')) + data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json')) data_keys = [] @@ -456,7 +456,7 @@ def ui(): #debug_slicer.change(lambda x: non_serialized_params.update({"debug_slicer": x}), debug_slicer, None) def update_dataset(): - return gr.update(choices=get_datasets('training/datasets', 'json')), gr.update(choices=get_datasets('training/datasets', 'txt')) + return gr.update(choices=get_datasets('user_data/training/datasets', 'json')), gr.update(choices=get_datasets('user_data/training/datasets', 'txt')) download_button.click(download_file_from_url, [download_file_url,download_check_overwrite,download_folder] , download_status).then(update_dataset,None,[dataset , raw_text_file]) @@ -670,7 +670,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch if raw_text_file not in ['None', '']: train_template["template_type"] = "raw_text" logger.info("Loading text file...") - fullpath = clean_path('training/datasets', f'{raw_text_file}') + fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}') fullpath = Path(fullpath) if fullpath.is_dir(): logger.info('Training path directory {}'.format(raw_text_file)) @@ -683,7 +683,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch logger.info(f"Loaded training file: {file_path.name}") else: - with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: + with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: raw_text = file.read().replace('\r', '') # FPHAM PRECISE SLICING @@ -720,7 +720,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch train_template["template_type"] = "dataset" - with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: + with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: format_data: dict[str, str] = json.load(formatFile) # == store training prompt == @@ -742,7 +742,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch return tokenize(prompt, add_eos_token, add_bos_token) logger.info("Loading JSON datasets...") - data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json')) + data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json')) train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30)) print(f"BOS: {add_bos_token} EOS: {add_eos_token}") @@ -751,7 +751,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch if eval_dataset == 'None': eval_data = None else: - eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json')) + eval_data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{eval_dataset}.json')) eval_data = 
eval_data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30)) # == We MUST reload model if it went through any previous training, even failed one == @@ -1157,11 +1157,11 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch decoded_entries.append({"value": decoded_text}) # Write the log file - Path('logs').mkdir(exist_ok=True) - with open(Path('logs/train_dataset_sample.json'), 'w') as json_file: + Path('user_data/logs').mkdir(exist_ok=True) + with open(Path('user_data/logs/train_dataset_sample.json'), 'w') as json_file: json.dump(decoded_entries, json_file, indent=4) - logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.") + logger.info("Log file 'train_dataset_sample.json' created in the 'user_data/logs' directory.") except Exception as e: logger.error(f"Failed to create log file due to error: {e}") diff --git a/extensions/Training_PRO/train_utils.py b/extensions/Training_PRO/train_utils.py index 18686144..79994880 100644 --- a/extensions/Training_PRO/train_utils.py +++ b/extensions/Training_PRO/train_utils.py @@ -194,13 +194,13 @@ def precise_cut(text: str, overlap: bool, min_chars_cut: int, eos_to_hc: bool, c if debug_slicer: # Write the log file - Path('logs').mkdir(exist_ok=True) + Path('user_data/logs').mkdir(exist_ok=True) sentencelist_dict = {index: sentence for index, sentence in enumerate(sentencelist)} - output_file = "logs/sentencelist.json" + output_file = "user_data/logs/sentencelist.json" with open(output_file, 'w') as f: json.dump(sentencelist_dict, f,indent=2) - print("Saved sentencelist.json in logs folder") + print("Saved sentencelist.json in user_data/logs folder") return sentencelist @@ -281,13 +281,13 @@ def sliding_block_cut(text: str, min_chars_cut: int, eos_to_hc: bool, cutoff_len if debug_slicer: # Write the log file - Path('logs').mkdir(exist_ok=True) + Path('user_data/logs').mkdir(exist_ok=True) sentencelist_dict = {index: sentence for index, sentence in enumerate(sentencelist)} - output_file = "logs/sentencelist.json" + output_file = "user_data/logs/sentencelist.json" with open(output_file, 'w') as f: json.dump(sentencelist_dict, f,indent=2) - print("Saved sentencelist.json in logs folder") + print("Saved sentencelist.json in user_data/logs folder") return sentencelist diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py index 76be4a58..8b242fb6 100644 --- a/extensions/gallery/script.py +++ b/extensions/gallery/script.py @@ -72,13 +72,13 @@ def generate_html(): global cards cards = [] # Iterate through files in image folder - for file in sorted(Path("characters").glob("*")): + for file in sorted(Path("user_data/characters").glob("*")): if file.suffix in [".json", ".yml", ".yaml"]: character = file.stem container_html = '
' image_html = "
" - for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: + for path in [Path(f"user_data/characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: if path.exists(): image_html = f'' break diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index ea688897..4d6018f9 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, Field class GenerationOptions(BaseModel): - preset: str | None = Field(default=None, description="The name of a file under text-generation-webui/presets (without the .yaml extension). The sampling parameters that get overwritten by this option are the keys in the default_preset() function in modules/presets.py.") + preset: str | None = Field(default=None, description="The name of a file under text-generation-webui/user_data/presets (without the .yaml extension). The sampling parameters that get overwritten by this option are the keys in the default_preset() function in modules/presets.py.") dynatemp_low: float = 1 dynatemp_high: float = 1 dynatemp_exponent: float = 1 @@ -103,10 +103,10 @@ class ChatCompletionRequestParams(BaseModel): mode: str = Field(default='instruct', description="Valid options: instruct, chat, chat-instruct.") - instruction_template: str | None = Field(default=None, description="An instruction template defined under text-generation-webui/instruction-templates. If not set, the correct template will be automatically obtained from the model metadata.") + instruction_template: str | None = Field(default=None, description="An instruction template defined under text-generation-webui/user_data/instruction-templates. If not set, the correct template will be automatically obtained from the model metadata.") instruction_template_str: str | None = Field(default=None, description="A Jinja2 instruction template. If set, will take precedence over everything else.") - character: str | None = Field(default=None, description="A character defined under text-generation-webui/characters. If not set, the default \"Assistant\" character will be used.") + character: str | None = Field(default=None, description="A character defined under text-generation-webui/user_data/characters. 
If not set, the default \"Assistant\" character will be used.") bot_name: str | None = Field(default=None, description="Overwrites the value set by character field.", alias="name2") context: str | None = Field(default=None, description="Overwrites the value set by character field.") greeting: str | None = Field(default=None, description="Overwrites the value set by character field.") diff --git a/js/main.js b/js/main.js index c5c47d04..33b7d6bd 100644 --- a/js/main.js +++ b/js/main.js @@ -395,7 +395,7 @@ let bigPictureVisible = false; function addBigPicture() { var imgElement = document.createElement("img"); var timestamp = new Date().getTime(); - imgElement.src = "/file/cache/pfp_character.png?time=" + timestamp; + imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp; imgElement.classList.add("bigProfilePicture"); imgElement.addEventListener("load", function () { this.style.visibility = "visible"; diff --git a/js/update_big_picture.js b/js/update_big_picture.js index 4c094776..ec51d63b 100644 --- a/js/update_big_picture.js +++ b/js/update_big_picture.js @@ -2,6 +2,6 @@ function updateBigPicture() { var existingElement = document.querySelector(".bigProfilePicture"); if (existingElement) { var timestamp = new Date().getTime(); - existingElement.src = "/file/cache/pfp_character.png?time=" + timestamp; + existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp; } } diff --git a/modules/chat.py b/modules/chat.py index 94d90bdc..e117e6ee 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -525,9 +525,9 @@ def start_new_chat(state): def get_history_file_path(unique_id, character, mode): if mode == 'instruct': - p = Path(f'logs/instruct/{unique_id}.json') + p = Path(f'user_data/logs/instruct/{unique_id}.json') else: - p = Path(f'logs/chat/{character}/{unique_id}.json') + p = Path(f'user_data/logs/chat/{character}/{unique_id}.json') return p @@ -563,13 +563,13 @@ def rename_history(old_id, new_id, character, mode): def get_paths(state): if state['mode'] == 'instruct': - return Path('logs/instruct').glob('*.json') + return Path('user_data/logs/instruct').glob('*.json') else: character = state['character_menu'] # Handle obsolete filenames and paths - old_p = Path(f'logs/{character}_persistent.json') - new_p = Path(f'logs/persistent_{character}.json') + old_p = Path(f'user_data/logs/{character}_persistent.json') + new_p = Path(f'user_data/logs/persistent_{character}.json') if old_p.exists(): logger.warning(f"Renaming \"{old_p}\" to \"{new_p}\"") old_p.rename(new_p) @@ -581,7 +581,7 @@ def get_paths(state): p.parent.mkdir(exist_ok=True) new_p.rename(p) - return Path(f'logs/chat/{character}').glob('*.json') + return Path(f'user_data/logs/chat/{character}').glob('*.json') def find_all_histories(state): @@ -732,7 +732,7 @@ def generate_pfp_cache(character): if not cache_folder.exists(): cache_folder.mkdir() - for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: + for path in [Path(f"user_data/characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: if path.exists(): original_img = Image.open(path) original_img.save(Path(f'{cache_folder}/pfp_character.png'), format='PNG') @@ -752,12 +752,12 @@ def load_character(character, name1, name2): filepath = None for extension in ["yml", "yaml", "json"]: - filepath = Path(f'characters/{character}.{extension}') + filepath = Path(f'user_data/characters/{character}.{extension}') if filepath.exists(): break if filepath is None or not filepath.exists(): 
- logger.error(f"Could not find the character \"{character}\" inside characters/. No character has been loaded.") + logger.error(f"Could not find the character \"{character}\" inside user_data/characters. No character has been loaded.") raise ValueError file_contents = open(filepath, 'r', encoding='utf-8').read() @@ -796,7 +796,7 @@ def load_instruction_template(template): if template == 'None': return '' - for filepath in [Path(f'instruction-templates/{template}.yaml'), Path('instruction-templates/Alpaca.yaml')]: + for filepath in [Path(f'user_data/instruction-templates/{template}.yaml'), Path('user_data/instruction-templates/Alpaca.yaml')]: if filepath.exists(): break else: @@ -838,17 +838,17 @@ def upload_character(file, img, tavern=False): outfile_name = name i = 1 - while Path(f'characters/{outfile_name}.yaml').exists(): + while Path(f'user_data/characters/{outfile_name}.yaml').exists(): outfile_name = f'{name}_{i:03d}' i += 1 - with open(Path(f'characters/{outfile_name}.yaml'), 'w', encoding='utf-8') as f: + with open(Path(f'user_data/characters/{outfile_name}.yaml'), 'w', encoding='utf-8') as f: f.write(yaml_data) if img is not None: - img.save(Path(f'characters/{outfile_name}.png')) + img.save(Path(f'user_data/characters/{outfile_name}.png')) - logger.info(f'New character saved to "characters/{outfile_name}.yaml".') + logger.info(f'New character saved to "user_data/characters/{outfile_name}.yaml".') return gr.update(value=outfile_name, choices=get_available_characters()) @@ -923,9 +923,9 @@ def save_character(name, greeting, context, picture, filename): return data = generate_character_yaml(name, greeting, context) - filepath = Path(f'characters/{filename}.yaml') + filepath = Path(f'user_data/characters/{filename}.yaml') save_file(filepath, data) - path_to_img = Path(f'characters/{filename}.png') + path_to_img = Path(f'user_data/characters/{filename}.png') if picture is not None: picture.save(path_to_img) logger.info(f'Saved {path_to_img}.') @@ -933,9 +933,9 @@ def save_character(name, greeting, context, picture, filename): def delete_character(name, instruct=False): for extension in ["yml", "yaml", "json"]: - delete_file(Path(f'characters/{name}.{extension}')) + delete_file(Path(f'user_data/characters/{name}.{extension}')) - delete_file(Path(f'characters/{name}.png')) + delete_file(Path(f'user_data/characters/{name}.png')) def jinja_template_from_old_format(params, verbose=False): @@ -1238,7 +1238,7 @@ def handle_save_template_click(instruction_template_str): contents = generate_instruction_template_yaml(instruction_template_str) return [ "My Template.yaml", - "instruction-templates/", + "user_data/instruction-templates/", contents, gr.update(visible=True) ] @@ -1247,7 +1247,7 @@ def handle_save_template_click(instruction_template_str): def handle_delete_template_click(template): return [ f"{template}.yaml", - "instruction-templates/", + "user_data/instruction-templates/", gr.update(visible=False) ] diff --git a/modules/evaluate.py b/modules/evaluate.py index ba0de378..4f41c1fc 100644 --- a/modules/evaluate.py +++ b/modules/evaluate.py @@ -12,8 +12,8 @@ from modules.text_generation import encode def load_past_evaluations(): - if Path('logs/evaluations.csv').exists(): - df = pd.read_csv(Path('logs/evaluations.csv'), dtype=str) + if Path('user_data/logs/evaluations.csv').exists(): + df = pd.read_csv(Path('user_data/logs/evaluations.csv'), dtype=str) df['Perplexity'] = pd.to_numeric(df['Perplexity']) return df else: @@ -26,7 +26,7 @@ past_evaluations = load_past_evaluations() def 
save_past_evaluations(df): global past_evaluations past_evaluations = df - filepath = Path('logs/evaluations.csv') + filepath = Path('user_data/logs/evaluations.csv') filepath.parent.mkdir(parents=True, exist_ok=True) df.to_csv(filepath, index=False) @@ -69,7 +69,7 @@ def calculate_perplexity(models, input_dataset, stride, _max_length): data = load_dataset('ptb_text_only', 'penn_treebank', split='test') text = " ".join(data['sentence']) else: - with open(Path(f'training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f: + with open(Path(f'user_data/training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f: text = f.read() for model in models: diff --git a/modules/html_generator.py b/modules/html_generator.py index a72e4859..c5252c26 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -387,13 +387,13 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= # We use ?character and ?time.time() to force the browser to reset caches img_bot = ( - f'' - if Path("cache/pfp_character_thumb.png").exists() else '' + f'' + if Path("user_data/cache/pfp_character_thumb.png").exists() else '' ) img_me = ( - f'' - if Path("cache/pfp_me.png").exists() else '' + f'' + if Path("user_data/cache/pfp_me.png").exists() else '' ) for i in range(len(history['visible'])): diff --git a/modules/models_settings.py b/modules/models_settings.py index d3ecd51f..ae589bb3 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -25,7 +25,7 @@ def get_fallback_settings(): def get_model_metadata(model): model_settings = {} - # Get settings from models/config.yaml and models/config-user.yaml + # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml settings = shared.model_config for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): @@ -144,7 +144,7 @@ def get_model_metadata(model): if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000: model_settings.pop('rope_freq_base') - # Apply user settings from models/config-user.yaml + # Apply user settings from user_data/models/config-user.yaml settings = shared.user_config for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): @@ -223,7 +223,7 @@ def apply_model_settings_to_state(model, state): def save_model_settings(model, state): ''' - Save the settings for this model to models/config-user.yaml + Save the settings for this model to user_data/models/config-user.yaml ''' if model == 'None': yield ("Not saving the settings because no model is selected in the menu.") diff --git a/modules/presets.py b/modules/presets.py index 7cab2af0..a432bf52 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -58,7 +58,7 @@ def presets_params(): def load_preset(name, verbose=False): generate_params = default_preset() if name not in ['None', None, '']: - path = Path(f'presets/{name}.yaml') + path = Path(f'user_data/presets/{name}.yaml') if path.exists(): with open(path, 'r') as infile: preset = yaml.safe_load(infile) diff --git a/modules/prompts.py b/modules/prompts.py index 565c2450..8f00cac2 100644 --- a/modules/prompts.py +++ b/modules/prompts.py @@ -7,7 +7,7 @@ def load_prompt(fname): if fname in ['None', '']: return '' else: - file_path = Path(f'prompts/{fname}.txt') + file_path = Path(f'user_data/prompts/{fname}.txt') if not file_path.exists(): return '' diff --git a/modules/shared.py b/modules/shared.py index 96f65929..21e6dd00 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -78,8 +78,8 @@ 
group.add_argument('--multi-user', action='store_true', help='Multi-user mode. C group.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.') group.add_argument('--model', type=str, help='Name of the model to load by default.') group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.') -group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.') -group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.') +group.add_argument('--model-dir', type=str, default='user_data/models', help='Path to directory with all the models.') +group.add_argument('--lora-dir', type=str, default='user_data/loras', help='Path to directory with all the loras.') group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.') group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') @@ -95,7 +95,7 @@ group = parser.add_argument_group('Transformers/Accelerate') group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.') group.add_argument('--cpu-memory', type=float, default=0, help='Maximum CPU memory in GiB. Use this for CPU offloading.') group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') -group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".') +group.add_argument('--disk-cache-dir', type=str, default='user_data/cache', help='Directory to save the disk cache to. Defaults to "user_data/cache".') group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).') group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. 
This reduces VRAM usage slightly, but it comes at a performance cost.') @@ -207,7 +207,7 @@ group.add_argument('--nowebui', action='store_true', help='Do not launch the Gra group = parser.add_argument_group('Deprecated') # Handle CMD_FLAGS.txt -cmd_flags_path = Path(__file__).parent.parent / "CMD_FLAGS.txt" +cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt" if cmd_flags_path.exists(): with cmd_flags_path.open('r', encoding='utf-8') as f: cmd_flags = ' '.join( diff --git a/modules/training.py b/modules/training.py index 69142463..2354c39d 100644 --- a/modules/training.py +++ b/modules/training.py @@ -106,23 +106,23 @@ def create_ui(): with gr.Column(): with gr.Tab(label='Formatted Dataset'): with gr.Row(): - format = gr.Dropdown(choices=utils.get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'], interactive=not mu) - ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('training/formats', 'json')}, 'refresh-button', interactive=not mu) + format = gr.Dropdown(choices=utils.get_datasets('user_data/training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'], interactive=not mu) + ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/formats', 'json')}, 'refresh-button', interactive=not mu) with gr.Row(): - dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu) - ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button', interactive=not mu) + dataset = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu) + ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'json')}, 'refresh-button', interactive=not mu) with gr.Row(): - eval_dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'], interactive=not mu) - ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button', interactive=not mu) + eval_dataset = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'], interactive=not mu) + ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'json')}, 'refresh-button', interactive=not mu) eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') with gr.Tab(label="Raw text file"): with gr.Row(): - raw_text_file = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use 
for training.', elem_classes=['slim-dropdown'], interactive=not mu) - ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'txt')}, 'refresh-button', interactive=not mu) + raw_text_file = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu) + ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'txt')}, 'refresh-button', interactive=not mu) with gr.Row(): with gr.Column(): @@ -143,7 +143,7 @@ def create_ui(): with gr.Row(): with gr.Column(): models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True, interactive=not mu) - evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.', interactive=not mu) + evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('user_data/training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under user_data/training/datasets.', interactive=not mu) with gr.Row(): with gr.Column(): stride_length = gr.Slider(label='Stride', minimum=0, maximum=32768, value=512, step=256, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 
512 is a common value.') @@ -402,7 +402,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: if raw_text_file not in ['None', '']: train_template["template_type"] = "raw_text" logger.info("Loading raw text file dataset") - fullpath = clean_path('training/datasets', f'{raw_text_file}') + fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}') fullpath = Path(fullpath) if fullpath.is_dir(): logger.info('Training path directory {}'.format(raw_text_file)) @@ -415,7 +415,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: logger.info(f"Loaded training file: {file_path.name}") else: - with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: + with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: raw_text = file.read().replace('\r', '') cut_string = hard_cut_string.replace('\\n', '\n') @@ -460,7 +460,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: train_template["template_type"] = "dataset" - with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: + with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: format_data: dict[str, str] = json.load(formatFile) # == store training prompt == @@ -482,13 +482,13 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: return tokenize(prompt, add_eos_token) logger.info("Loading JSON datasets") - data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json')) + data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json')) train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30)) if eval_dataset == 'None': eval_data = None else: - eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json')) + eval_data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{eval_dataset}.json')) eval_data = eval_data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30)) # == We MUST reload model if it went through any previous training, even failed one == @@ -676,11 +676,11 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: decoded_entries.append({"value": decoded_text}) # Write the log file - Path('logs').mkdir(exist_ok=True) - with open(Path('logs/train_dataset_sample.json'), 'w') as json_file: + Path('user_data/logs').mkdir(exist_ok=True) + with open(Path('user_data/logs/train_dataset_sample.json'), 'w') as json_file: json.dump(decoded_entries, json_file, indent=4) - logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.") + logger.info("Log file 'train_dataset_sample.json' created in the 'user_data/logs' directory.") except Exception as e: logger.error(f"Failed to create log file due to error: {e}") diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py index add3be66..905f5c47 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -249,7 +249,7 @@ def load_model_HF(model_name): ) if shared.args.disk: - params['offload_folder'] = shared.args.disk_cache_dir + params['offload_folder'] = str(Path(shared.args.disk_cache_dir)) if shared.args.compress_pos_emb > 1: params['rope_scaling'] = {'type': 'linear', 
'factor': shared.args.compress_pos_emb} diff --git a/modules/ui.py b/modules/ui.py index 68cb76a6..ef5ed0e6 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -94,7 +94,7 @@ if not shared.args.old_colors: input_radius='0.375rem', ) -if Path("notification.mp3").exists(): +if Path("user_data/notification.mp3").exists(): audio_notification_js = "document.querySelector('#audio_notification audio')?.play();" else: audio_notification_js = "" diff --git a/modules/ui_chat.py b/modules/ui_chat.py index b823b8e5..0d588549 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -146,7 +146,7 @@ def create_chat_settings_ui(): with gr.Column(scale=1): shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil', interactive=not mu) - shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None, interactive=not mu) + shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('user_data/cache/pfp_me.png')) if Path('user_data/cache/pfp_me.png').exists() else None, interactive=not mu) with gr.Tab('Instruction template'): with gr.Row(): diff --git a/modules/ui_default.py b/modules/ui_default.py index ccae9a5e..c2946b37 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -102,7 +102,7 @@ def handle_save_prompt(text): return [ text, utils.current_time() + ".txt", - "prompts/", + "user_data/prompts/", gr.update(visible=True) ] @@ -110,6 +110,6 @@ def handle_save_prompt(text): def handle_delete_prompt(prompt): return [ prompt + ".txt", - "prompts/", + "user_data/prompts/", gr.update(visible=True) ] diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py index 3a27e1b9..d1f9379b 100644 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -28,7 +28,7 @@ def create_ui(): # Character saver/deleter with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']: - shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.') + shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your user_data/characters folder with this base filename.') with gr.Row(): shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button") shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button", variant='primary', interactive=not mu) @@ -41,7 +41,7 @@ def create_ui(): # Preset saver with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['preset_saver']: - shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your presets/ folder with this base filename.') + shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your user_data/presets folder with this base filename.') shared.gradio['save_preset_contents'] = gr.Textbox(lines=10, label='File contents') with gr.Row(): shared.gradio['save_preset_cancel'] = gr.Button('Cancel', elem_classes="small-button") @@ -72,7 +72,7 @@ def create_event_handlers(): def handle_save_preset_confirm_click(filename, contents): try: - utils.save_file(f"presets/{filename}.yaml", contents) + utils.save_file(f"user_data/presets/{filename}.yaml", contents) available_presets = utils.get_available_presets() output = 
gr.update(choices=available_presets, value=filename) except Exception: @@ -145,7 +145,7 @@ def handle_save_preset_click(state): def handle_delete_preset_click(preset): return [ f"{preset}.yaml", - "presets/", + "user_data/presets/", gr.update(visible=True) ] @@ -154,7 +154,7 @@ def handle_save_grammar_click(grammar_string): return [ grammar_string, "My Fancy Grammar.gbnf", - "grammars/", + "user_data/grammars/", gr.update(visible=True) ] @@ -162,6 +162,6 @@ def handle_save_grammar_click(grammar_string): def handle_delete_grammar_click(grammar_file): return [ grammar_file, - "grammars/", + "user_data/grammars/", gr.update(visible=True) ] diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 6bd647c6..dc09c899 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -223,9 +223,9 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur model_dir=shared.args.model_dir if shared.args.model_dir != shared.args_defaults.model_dir else None ) - if output_folder == Path("models"): + if output_folder == Path("user_data/models"): output_folder = Path(shared.args.model_dir) - elif output_folder == Path("loras"): + elif output_folder == Path("user_data/loras"): output_folder = Path(shared.args.lora_dir) if check: diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 156e4128..6c2715af 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -128,7 +128,7 @@ def get_truncation_length(): def load_grammar(name): - p = Path(f'grammars/{name}') + p = Path(f'user_data/grammars/{name}') if p.exists(): return open(p, 'r', encoding='utf-8').read() else: diff --git a/modules/ui_session.py b/modules/ui_session.py index 66386d12..42434e51 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -48,7 +48,7 @@ def handle_save_settings(state, preset, extensions, show_controls, theme): return [ contents, "settings.yaml", - "./", + "./user_data", gr.update(visible=True) ] diff --git a/modules/utils.py b/modules/utils.py index 269561aa..77324139 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -98,7 +98,7 @@ def get_available_models(): dirs_with_gguf = set() for gguf_path in gguf_files: path = Path(gguf_path) - if path.parts: + if len(path.parts) > 0: dirs_with_gguf.add(path.parts[0]) # Find directories with safetensors files @@ -141,11 +141,11 @@ def get_available_ggufs(): def get_available_presets(): - return sorted(set((k.stem for k in Path('presets').glob('*.yaml'))), key=natural_keys) + return sorted(set((k.stem for k in Path('user_data/presets').glob('*.yaml'))), key=natural_keys) def get_available_prompts(): - prompt_files = list(Path('prompts').glob('*.txt')) + prompt_files = list(Path('user_data/prompts').glob('*.txt')) sorted_files = sorted(prompt_files, key=lambda x: x.stat().st_mtime, reverse=True) prompts = [file.stem for file in sorted_files] prompts.append('None') @@ -153,12 +153,12 @@ def get_available_prompts(): def get_available_characters(): - paths = (x for x in Path('characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml')) + paths = (x for x in Path('user_data/characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml')) return sorted(set((k.stem for k in paths)), key=natural_keys) def get_available_instruction_templates(): - path = "instruction-templates" + path = "user_data/instruction-templates" paths = [] if os.path.exists(path): paths = (x for x in Path(path).iterdir() if x.suffix in ('.json', '.yaml', '.yml')) @@ -189,4 +189,4 @@ def get_available_chat_styles(): def 
get_available_grammars(): - return ['None'] + sorted([item.name for item in list(Path('grammars').glob('*.gbnf'))], key=natural_keys) + return ['None'] + sorted([item.name for item in list(Path('user_data/grammars').glob('*.gbnf'))], key=natural_keys) diff --git a/one_click.py b/one_click.py index 5e3d691b..065afd99 100644 --- a/one_click.py +++ b/one_click.py @@ -293,10 +293,10 @@ def install_webui(): # Write a flag to CMD_FLAGS.txt for CPU mode if selected_gpu == "NONE": - cmd_flags_path = os.path.join(script_dir, "CMD_FLAGS.txt") + cmd_flags_path = os.path.join(script_dir, "user_data", "CMD_FLAGS.txt") with open(cmd_flags_path, 'r+') as cmd_flags_file: if "--cpu" not in cmd_flags_file.read(): - print_big_message("Adding the --cpu flag to CMD_FLAGS.txt.") + print_big_message("Adding the --cpu flag to user_data/CMD_FLAGS.txt.") cmd_flags_file.write("\n--cpu\n") # Handle CUDA version display @@ -532,7 +532,7 @@ if __name__ == "__main__": flags_list = re.split(' +(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)|=', flags) model_dir = [flags_list[(flags_list.index(flag) + 1)] for flag in flags_list if flag == '--model-dir'][0].strip('"\'') else: - model_dir = 'models' + model_dir = 'user_data/models' if len([item for item in glob.glob(f'{model_dir}/*') if not item.endswith(('.txt', '.yaml'))]) == 0: print_big_message("You haven't downloaded any model yet.\nOnce the web UI launches, head over to the \"Model\" tab and download one.") diff --git a/server.py b/server.py index 41a5660d..01d40ac4 100644 --- a/server.py +++ b/server.py @@ -94,8 +94,8 @@ def create_interface(): 'filter_by_loader': shared.args.loader or 'All' }) - if Path("cache/pfp_character.png").exists(): - Path("cache/pfp_character.png").unlink() + if Path("user_data/cache/pfp_character.png").exists(): + Path("user_data/cache/pfp_character.png").unlink() # css/js strings css = ui.css @@ -112,8 +112,8 @@ def create_interface(): shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements}) # Audio notification - if Path("notification.mp3").exists(): - shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="notification.mp3", elem_id="audio_notification", visible=False) + if Path("user_data/notification.mp3").exists(): + shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="user_data/notification.mp3", elem_id="audio_notification", visible=False) # Floating menus for saving/deleting files ui_file_saving.create_ui() @@ -179,7 +179,7 @@ def create_interface(): ssl_keyfile=shared.args.ssl_keyfile, ssl_certfile=shared.args.ssl_certfile, root_path=shared.args.subpath, - allowed_paths=["cache", "css", "extensions", "js"] + allowed_paths=["css", "js", "extensions", "user_data/cache"] ) @@ -192,10 +192,10 @@ if __name__ == "__main__": settings_file = None if shared.args.settings is not None and Path(shared.args.settings).exists(): settings_file = Path(shared.args.settings) - elif Path('settings.yaml').exists(): - settings_file = Path('settings.yaml') - elif Path('settings.json').exists(): - settings_file = Path('settings.json') + elif Path('user_data/settings.yaml').exists(): + settings_file = Path('user_data/settings.yaml') + elif Path('user_data/settings.json').exists(): + settings_file = Path('user_data/settings.json') if settings_file is not None: logger.info(f"Loading settings from \"{settings_file}\"") diff --git a/CMD_FLAGS.txt b/user_data/CMD_FLAGS.txt similarity index 100% rename from CMD_FLAGS.txt rename to user_data/CMD_FLAGS.txt diff --git a/characters/Assistant.yaml 
b/user_data/characters/Assistant.yaml similarity index 100% rename from characters/Assistant.yaml rename to user_data/characters/Assistant.yaml diff --git a/characters/Example.png b/user_data/characters/Example.png similarity index 100% rename from characters/Example.png rename to user_data/characters/Example.png diff --git a/characters/Example.yaml b/user_data/characters/Example.yaml similarity index 100% rename from characters/Example.yaml rename to user_data/characters/Example.yaml diff --git a/grammars/arithmetic.gbnf b/user_data/grammars/arithmetic.gbnf similarity index 100% rename from grammars/arithmetic.gbnf rename to user_data/grammars/arithmetic.gbnf diff --git a/grammars/c.gbnf b/user_data/grammars/c.gbnf similarity index 100% rename from grammars/c.gbnf rename to user_data/grammars/c.gbnf diff --git a/grammars/chess.gbnf b/user_data/grammars/chess.gbnf similarity index 100% rename from grammars/chess.gbnf rename to user_data/grammars/chess.gbnf diff --git a/grammars/json.gbnf b/user_data/grammars/json.gbnf similarity index 100% rename from grammars/json.gbnf rename to user_data/grammars/json.gbnf diff --git a/grammars/json_w_trailing_space.gbnf b/user_data/grammars/json_w_trailing_space.gbnf similarity index 100% rename from grammars/json_w_trailing_space.gbnf rename to user_data/grammars/json_w_trailing_space.gbnf diff --git a/grammars/list.gbnf b/user_data/grammars/list.gbnf similarity index 100% rename from grammars/list.gbnf rename to user_data/grammars/list.gbnf diff --git a/grammars/roleplay.gbnf b/user_data/grammars/roleplay.gbnf similarity index 100% rename from grammars/roleplay.gbnf rename to user_data/grammars/roleplay.gbnf diff --git a/grammars/simple_arithmetic.gbnf b/user_data/grammars/simple_arithmetic.gbnf similarity index 100% rename from grammars/simple_arithmetic.gbnf rename to user_data/grammars/simple_arithmetic.gbnf diff --git a/instruction-templates/Airoboros-v1.2.yaml b/user_data/instruction-templates/Airoboros-v1.2.yaml similarity index 100% rename from instruction-templates/Airoboros-v1.2.yaml rename to user_data/instruction-templates/Airoboros-v1.2.yaml diff --git a/instruction-templates/Alpaca.yaml b/user_data/instruction-templates/Alpaca.yaml similarity index 100% rename from instruction-templates/Alpaca.yaml rename to user_data/instruction-templates/Alpaca.yaml diff --git a/instruction-templates/Bactrian.yaml b/user_data/instruction-templates/Bactrian.yaml similarity index 100% rename from instruction-templates/Bactrian.yaml rename to user_data/instruction-templates/Bactrian.yaml diff --git a/instruction-templates/Baichuan Chat.yaml b/user_data/instruction-templates/Baichuan Chat.yaml similarity index 100% rename from instruction-templates/Baichuan Chat.yaml rename to user_data/instruction-templates/Baichuan Chat.yaml diff --git a/instruction-templates/Baize.yaml b/user_data/instruction-templates/Baize.yaml similarity index 100% rename from instruction-templates/Baize.yaml rename to user_data/instruction-templates/Baize.yaml diff --git a/instruction-templates/Bluemoon.yaml b/user_data/instruction-templates/Bluemoon.yaml similarity index 100% rename from instruction-templates/Bluemoon.yaml rename to user_data/instruction-templates/Bluemoon.yaml diff --git a/instruction-templates/ChatGLM.yaml b/user_data/instruction-templates/ChatGLM.yaml similarity index 100% rename from instruction-templates/ChatGLM.yaml rename to user_data/instruction-templates/ChatGLM.yaml diff --git a/instruction-templates/ChatML.yaml 
b/user_data/instruction-templates/ChatML.yaml similarity index 100% rename from instruction-templates/ChatML.yaml rename to user_data/instruction-templates/ChatML.yaml diff --git a/instruction-templates/Chinese-Vicuna-Chat.yaml b/user_data/instruction-templates/Chinese-Vicuna-Chat.yaml similarity index 100% rename from instruction-templates/Chinese-Vicuna-Chat.yaml rename to user_data/instruction-templates/Chinese-Vicuna-Chat.yaml diff --git a/instruction-templates/Command-R.yaml b/user_data/instruction-templates/Command-R.yaml similarity index 100% rename from instruction-templates/Command-R.yaml rename to user_data/instruction-templates/Command-R.yaml diff --git a/instruction-templates/Galactica Cite.yaml b/user_data/instruction-templates/Galactica Cite.yaml similarity index 100% rename from instruction-templates/Galactica Cite.yaml rename to user_data/instruction-templates/Galactica Cite.yaml diff --git a/instruction-templates/Galactica Finetuned.yaml b/user_data/instruction-templates/Galactica Finetuned.yaml similarity index 100% rename from instruction-templates/Galactica Finetuned.yaml rename to user_data/instruction-templates/Galactica Finetuned.yaml diff --git a/instruction-templates/Galactica Q.yaml b/user_data/instruction-templates/Galactica Q.yaml similarity index 100% rename from instruction-templates/Galactica Q.yaml rename to user_data/instruction-templates/Galactica Q.yaml diff --git a/instruction-templates/Galactica Summary.yaml b/user_data/instruction-templates/Galactica Summary.yaml similarity index 100% rename from instruction-templates/Galactica Summary.yaml rename to user_data/instruction-templates/Galactica Summary.yaml diff --git a/instruction-templates/Galactica Work.yaml b/user_data/instruction-templates/Galactica Work.yaml similarity index 100% rename from instruction-templates/Galactica Work.yaml rename to user_data/instruction-templates/Galactica Work.yaml diff --git a/instruction-templates/Galactica v2.yaml b/user_data/instruction-templates/Galactica v2.yaml similarity index 100% rename from instruction-templates/Galactica v2.yaml rename to user_data/instruction-templates/Galactica v2.yaml diff --git a/instruction-templates/Galactica.yaml b/user_data/instruction-templates/Galactica.yaml similarity index 100% rename from instruction-templates/Galactica.yaml rename to user_data/instruction-templates/Galactica.yaml diff --git a/instruction-templates/Gorilla.yaml b/user_data/instruction-templates/Gorilla.yaml similarity index 100% rename from instruction-templates/Gorilla.yaml rename to user_data/instruction-templates/Gorilla.yaml diff --git a/instruction-templates/Guanaco non-chat.yaml b/user_data/instruction-templates/Guanaco non-chat.yaml similarity index 100% rename from instruction-templates/Guanaco non-chat.yaml rename to user_data/instruction-templates/Guanaco non-chat.yaml diff --git a/instruction-templates/Guanaco-QLoRA.yaml b/user_data/instruction-templates/Guanaco-QLoRA.yaml similarity index 100% rename from instruction-templates/Guanaco-QLoRA.yaml rename to user_data/instruction-templates/Guanaco-QLoRA.yaml diff --git a/instruction-templates/H2O-prompt_answer.yaml b/user_data/instruction-templates/H2O-prompt_answer.yaml similarity index 100% rename from instruction-templates/H2O-prompt_answer.yaml rename to user_data/instruction-templates/H2O-prompt_answer.yaml diff --git a/instruction-templates/Hippogriff.yaml b/user_data/instruction-templates/Hippogriff.yaml similarity index 100% rename from instruction-templates/Hippogriff.yaml rename to 
user_data/instruction-templates/Hippogriff.yaml diff --git a/instruction-templates/INCITE-Chat.yaml b/user_data/instruction-templates/INCITE-Chat.yaml similarity index 100% rename from instruction-templates/INCITE-Chat.yaml rename to user_data/instruction-templates/INCITE-Chat.yaml diff --git a/instruction-templates/INCITE-Instruct.yaml b/user_data/instruction-templates/INCITE-Instruct.yaml similarity index 100% rename from instruction-templates/INCITE-Instruct.yaml rename to user_data/instruction-templates/INCITE-Instruct.yaml diff --git a/instruction-templates/KoAlpaca.yaml b/user_data/instruction-templates/KoAlpaca.yaml similarity index 100% rename from instruction-templates/KoAlpaca.yaml rename to user_data/instruction-templates/KoAlpaca.yaml diff --git a/instruction-templates/Koala.yaml b/user_data/instruction-templates/Koala.yaml similarity index 100% rename from instruction-templates/Koala.yaml rename to user_data/instruction-templates/Koala.yaml diff --git a/instruction-templates/LLaVA.yaml b/user_data/instruction-templates/LLaVA.yaml similarity index 100% rename from instruction-templates/LLaVA.yaml rename to user_data/instruction-templates/LLaVA.yaml diff --git a/instruction-templates/Llama-v2.yaml b/user_data/instruction-templates/Llama-v2.yaml similarity index 100% rename from instruction-templates/Llama-v2.yaml rename to user_data/instruction-templates/Llama-v2.yaml diff --git a/instruction-templates/Llama-v3.yaml b/user_data/instruction-templates/Llama-v3.yaml similarity index 100% rename from instruction-templates/Llama-v3.yaml rename to user_data/instruction-templates/Llama-v3.yaml diff --git a/instruction-templates/MOSS.yaml b/user_data/instruction-templates/MOSS.yaml similarity index 100% rename from instruction-templates/MOSS.yaml rename to user_data/instruction-templates/MOSS.yaml diff --git a/instruction-templates/Manticore Chat.yaml b/user_data/instruction-templates/Manticore Chat.yaml similarity index 100% rename from instruction-templates/Manticore Chat.yaml rename to user_data/instruction-templates/Manticore Chat.yaml diff --git a/instruction-templates/Metharme.yaml b/user_data/instruction-templates/Metharme.yaml similarity index 100% rename from instruction-templates/Metharme.yaml rename to user_data/instruction-templates/Metharme.yaml diff --git a/instruction-templates/Mistral.yaml b/user_data/instruction-templates/Mistral.yaml similarity index 100% rename from instruction-templates/Mistral.yaml rename to user_data/instruction-templates/Mistral.yaml diff --git a/instruction-templates/NVIDIA-ChatQA.yaml b/user_data/instruction-templates/NVIDIA-ChatQA.yaml similarity index 100% rename from instruction-templates/NVIDIA-ChatQA.yaml rename to user_data/instruction-templates/NVIDIA-ChatQA.yaml diff --git a/instruction-templates/NewHope.yaml b/user_data/instruction-templates/NewHope.yaml similarity index 100% rename from instruction-templates/NewHope.yaml rename to user_data/instruction-templates/NewHope.yaml diff --git a/instruction-templates/Open Assistant.yaml b/user_data/instruction-templates/Open Assistant.yaml similarity index 100% rename from instruction-templates/Open Assistant.yaml rename to user_data/instruction-templates/Open Assistant.yaml diff --git a/instruction-templates/OpenBuddy.yaml b/user_data/instruction-templates/OpenBuddy.yaml similarity index 100% rename from instruction-templates/OpenBuddy.yaml rename to user_data/instruction-templates/OpenBuddy.yaml diff --git a/instruction-templates/OpenChat.yaml b/user_data/instruction-templates/OpenChat.yaml 
similarity index 100% rename from instruction-templates/OpenChat.yaml rename to user_data/instruction-templates/OpenChat.yaml diff --git a/instruction-templates/OpenOrca-Platypus2.yaml b/user_data/instruction-templates/OpenOrca-Platypus2.yaml similarity index 100% rename from instruction-templates/OpenOrca-Platypus2.yaml rename to user_data/instruction-templates/OpenOrca-Platypus2.yaml diff --git a/instruction-templates/Orca Mini.yaml b/user_data/instruction-templates/Orca Mini.yaml similarity index 100% rename from instruction-templates/Orca Mini.yaml rename to user_data/instruction-templates/Orca Mini.yaml diff --git a/instruction-templates/Orca-Vicuna.yaml b/user_data/instruction-templates/Orca-Vicuna.yaml similarity index 100% rename from instruction-templates/Orca-Vicuna.yaml rename to user_data/instruction-templates/Orca-Vicuna.yaml diff --git a/instruction-templates/RWKV-Raven.yaml b/user_data/instruction-templates/RWKV-Raven.yaml similarity index 100% rename from instruction-templates/RWKV-Raven.yaml rename to user_data/instruction-templates/RWKV-Raven.yaml diff --git a/instruction-templates/RWKV-World.yaml b/user_data/instruction-templates/RWKV-World.yaml similarity index 100% rename from instruction-templates/RWKV-World.yaml rename to user_data/instruction-templates/RWKV-World.yaml diff --git a/instruction-templates/Samantha.yaml b/user_data/instruction-templates/Samantha.yaml similarity index 100% rename from instruction-templates/Samantha.yaml rename to user_data/instruction-templates/Samantha.yaml diff --git a/instruction-templates/StableBeluga2.yaml b/user_data/instruction-templates/StableBeluga2.yaml similarity index 100% rename from instruction-templates/StableBeluga2.yaml rename to user_data/instruction-templates/StableBeluga2.yaml diff --git a/instruction-templates/StableLM.yaml b/user_data/instruction-templates/StableLM.yaml similarity index 100% rename from instruction-templates/StableLM.yaml rename to user_data/instruction-templates/StableLM.yaml diff --git a/instruction-templates/StableVicuna.yaml b/user_data/instruction-templates/StableVicuna.yaml similarity index 100% rename from instruction-templates/StableVicuna.yaml rename to user_data/instruction-templates/StableVicuna.yaml diff --git a/instruction-templates/Starchat-Beta.yaml b/user_data/instruction-templates/Starchat-Beta.yaml similarity index 100% rename from instruction-templates/Starchat-Beta.yaml rename to user_data/instruction-templates/Starchat-Beta.yaml diff --git a/instruction-templates/Synthia-CoT.yaml b/user_data/instruction-templates/Synthia-CoT.yaml similarity index 100% rename from instruction-templates/Synthia-CoT.yaml rename to user_data/instruction-templates/Synthia-CoT.yaml diff --git a/instruction-templates/Synthia.yaml b/user_data/instruction-templates/Synthia.yaml similarity index 100% rename from instruction-templates/Synthia.yaml rename to user_data/instruction-templates/Synthia.yaml diff --git a/instruction-templates/Tulu.yaml b/user_data/instruction-templates/Tulu.yaml similarity index 100% rename from instruction-templates/Tulu.yaml rename to user_data/instruction-templates/Tulu.yaml diff --git a/instruction-templates/Vicuna-v0.yaml b/user_data/instruction-templates/Vicuna-v0.yaml similarity index 100% rename from instruction-templates/Vicuna-v0.yaml rename to user_data/instruction-templates/Vicuna-v0.yaml diff --git a/instruction-templates/Vicuna-v1.1.yaml b/user_data/instruction-templates/Vicuna-v1.1.yaml similarity index 100% rename from instruction-templates/Vicuna-v1.1.yaml rename 
to user_data/instruction-templates/Vicuna-v1.1.yaml diff --git a/instruction-templates/Vigogne-Chat.yaml b/user_data/instruction-templates/Vigogne-Chat.yaml similarity index 100% rename from instruction-templates/Vigogne-Chat.yaml rename to user_data/instruction-templates/Vigogne-Chat.yaml diff --git a/instruction-templates/Vigogne-Instruct.yaml b/user_data/instruction-templates/Vigogne-Instruct.yaml similarity index 100% rename from instruction-templates/Vigogne-Instruct.yaml rename to user_data/instruction-templates/Vigogne-Instruct.yaml diff --git a/instruction-templates/Wizard-Mega ShareGPT.yaml b/user_data/instruction-templates/Wizard-Mega ShareGPT.yaml similarity index 100% rename from instruction-templates/Wizard-Mega ShareGPT.yaml rename to user_data/instruction-templates/Wizard-Mega ShareGPT.yaml diff --git a/instruction-templates/Wizard-Mega.yaml b/user_data/instruction-templates/Wizard-Mega.yaml similarity index 100% rename from instruction-templates/Wizard-Mega.yaml rename to user_data/instruction-templates/Wizard-Mega.yaml diff --git a/instruction-templates/Ziya.yaml b/user_data/instruction-templates/Ziya.yaml similarity index 100% rename from instruction-templates/Ziya.yaml rename to user_data/instruction-templates/Ziya.yaml diff --git a/loras/place-your-loras-here.txt b/user_data/loras/place-your-loras-here.txt similarity index 100% rename from loras/place-your-loras-here.txt rename to user_data/loras/place-your-loras-here.txt diff --git a/models/config.yaml b/user_data/models/config.yaml similarity index 100% rename from models/config.yaml rename to user_data/models/config.yaml diff --git a/models/place-your-models-here.txt b/user_data/models/place-your-models-here.txt similarity index 100% rename from models/place-your-models-here.txt rename to user_data/models/place-your-models-here.txt diff --git a/presets/Contrastive Search.yaml b/user_data/presets/Contrastive Search.yaml similarity index 100% rename from presets/Contrastive Search.yaml rename to user_data/presets/Contrastive Search.yaml diff --git a/presets/Creative.yaml b/user_data/presets/Creative.yaml similarity index 100% rename from presets/Creative.yaml rename to user_data/presets/Creative.yaml diff --git a/presets/Deterministic.yaml b/user_data/presets/Deterministic.yaml similarity index 100% rename from presets/Deterministic.yaml rename to user_data/presets/Deterministic.yaml diff --git a/presets/Instruct.yaml b/user_data/presets/Instruct.yaml similarity index 100% rename from presets/Instruct.yaml rename to user_data/presets/Instruct.yaml diff --git a/presets/Null preset.yaml b/user_data/presets/Null preset.yaml similarity index 100% rename from presets/Null preset.yaml rename to user_data/presets/Null preset.yaml diff --git a/presets/min_p.yaml b/user_data/presets/min_p.yaml similarity index 100% rename from presets/min_p.yaml rename to user_data/presets/min_p.yaml diff --git a/prompts/Alpaca-with-Input.txt b/user_data/prompts/Alpaca-with-Input.txt similarity index 100% rename from prompts/Alpaca-with-Input.txt rename to user_data/prompts/Alpaca-with-Input.txt diff --git a/prompts/QA.txt b/user_data/prompts/QA.txt similarity index 100% rename from prompts/QA.txt rename to user_data/prompts/QA.txt diff --git a/training/datasets/put-trainer-datasets-here.txt b/user_data/training/datasets/put-trainer-datasets-here.txt similarity index 100% rename from training/datasets/put-trainer-datasets-here.txt rename to user_data/training/datasets/put-trainer-datasets-here.txt diff --git 
a/training/formats/ChatML-format.json b/user_data/training/formats/ChatML-format.json similarity index 100% rename from training/formats/ChatML-format.json rename to user_data/training/formats/ChatML-format.json diff --git a/training/formats/alpaca-chatbot-format.json b/user_data/training/formats/alpaca-chatbot-format.json similarity index 100% rename from training/formats/alpaca-chatbot-format.json rename to user_data/training/formats/alpaca-chatbot-format.json diff --git a/training/formats/alpaca-format.json b/user_data/training/formats/alpaca-format.json similarity index 100% rename from training/formats/alpaca-format.json rename to user_data/training/formats/alpaca-format.json diff --git a/training/formats/llama2-chat-format.json b/user_data/training/formats/llama2-chat-format.json similarity index 100% rename from training/formats/llama2-chat-format.json rename to user_data/training/formats/llama2-chat-format.json diff --git a/training/formats/vicuna-format.json b/user_data/training/formats/vicuna-format.json similarity index 100% rename from training/formats/vicuna-format.json rename to user_data/training/formats/vicuna-format.json From 763a7011c05567bcde6c30f637a22dd826bacb82 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 04:57:36 -0700 Subject: [PATCH 25/49] Remove an ancient/obsolete migration check --- modules/one_click_installer_check.py | 9 --------- server.py | 1 - 2 files changed, 10 deletions(-) delete mode 100644 modules/one_click_installer_check.py diff --git a/modules/one_click_installer_check.py b/modules/one_click_installer_check.py deleted file mode 100644 index 4bde8600..00000000 --- a/modules/one_click_installer_check.py +++ /dev/null @@ -1,9 +0,0 @@ -from pathlib import Path - -from modules.logging_colors import logger - -if Path('../webui.py').exists(): - logger.warning('\nIt looks like you are running an outdated version of ' - 'the one-click-installers.\n' - 'Please migrate your installation following the instructions here:\n' - 'https://github.com/oobabooga/text-generation-webui/wiki/Migrating-an-old-one%E2%80%90click-install') diff --git a/server.py b/server.py index 01d40ac4..169578a5 100644 --- a/server.py +++ b/server.py @@ -1,7 +1,6 @@ import os import warnings -import modules.one_click_installer_check from modules import shared from modules.block_requests import OpenMonkeyPatch, RequestBlocker from modules.logging_colors import logger From b976112539b586fed6334a54b2a261a7c3569622 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 05:00:37 -0700 Subject: [PATCH 26/49] Remove the WSL installation scripts They were useful in 2023 but now everything runs natively on Windows. 
--- .../workflows/build-portable-release-cuda.yml | 2 +- .../build-portable-release-vulkan.yml | 2 +- .github/workflows/build-portable-release.yml | 2 +- cmd_wsl.bat | 11 -- docs/10 - WSL.md | 146 ------------------ start_wsl.bat | 11 -- update_wizard_wsl.bat | 11 -- wsl.sh | 115 -------------- 8 files changed, 3 insertions(+), 297 deletions(-) delete mode 100755 cmd_wsl.bat delete mode 100644 docs/10 - WSL.md delete mode 100755 start_wsl.bat delete mode 100755 update_wizard_wsl.bat delete mode 100755 wsl.sh diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml index fcc74408..aacd59f9 100644 --- a/.github/workflows/build-portable-release-cuda.yml +++ b/.github/workflows/build-portable-release-cuda.yml @@ -101,7 +101,7 @@ jobs: - name: Build Package shell: bash run: | - rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker + rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker # Define common variables CUDA_VERSION="${{ matrix.cuda }}" diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml index d5aa764c..6f1e5ec8 100644 --- a/.github/workflows/build-portable-release-vulkan.yml +++ b/.github/workflows/build-portable-release-vulkan.yml @@ -100,7 +100,7 @@ jobs: - name: Build Package shell: bash run: | - rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker + rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker # Define common variables AVX_SUPPORT="${{ matrix.avx }}" diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml index 2424cc44..af886652 100644 --- a/.github/workflows/build-portable-release.yml +++ b/.github/workflows/build-portable-release.yml @@ -100,7 +100,7 @@ jobs: - name: Build Package shell: bash run: | - rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker + rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker # Define common variables AVX_SUPPORT="${{ matrix.avx }}" diff --git a/cmd_wsl.bat b/cmd_wsl.bat deleted file mode 100755 index f9f4348a..00000000 --- a/cmd_wsl.bat +++ /dev/null @@ -1,11 +0,0 @@ -@echo off - -cd /D "%~dp0" - -set PATH=%PATH%;%SystemRoot%\system32 - -@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script -call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh cmd" - -:end -pause diff --git a/docs/10 - WSL.md b/docs/10 - WSL.md deleted file mode 100644 index e0d66393..00000000 --- a/docs/10 - WSL.md +++ /dev/null @@ -1,146 +0,0 @@ -## WSL instructions - -If you do not have WSL installed, follow the [instructions below](https://github.com/oobabooga/text-generation-webui/wiki/10-%E2%80%90-WSL#wsl-installation) first. - -### Additional WSL setup info - -If you want to install Linux to a drive other than C, open powershell and enter these commands: - -``` -cd D:\Path\To\Linux -$ProgressPreference = 'SilentlyContinue' -Invoke-WebRequest -Uri -OutFile Linux.appx -UseBasicParsing -mv Linux.appx Linux.zip -``` - -Then open Linux.zip and you should see several .appx files inside. - -The one with _x64.appx contains the exe installer that you need. - -Extract the contents of that _x64.appx file and run .exe to install. 
- -Linux Distro URLs: https://learn.microsoft.com/en-us/windows/wsl/install-manual#downloading-distributions - -**ENSURE THAT THE WSL LINUX DISTRO THAT YOU WISH TO USE IS SET AS THE DEFAULT!** - -Do this by using these commands: - -``` -wsl -l -wsl -s -``` - -### Web UI Installation - -Run the "start" script. By default it will install the web UI in WSL: -/home/{username}/text-gen-install - -To launch the web UI in the future after it is already installed, run -the same "start" script. Ensure that one_click.py and wsl.sh are next to it! - -### Updating the web UI - -As an alternative to running the "update" script, you can also run "wsl.sh update" in WSL. - -### Running an interactive shell - -As an alternative to running the "cmd" script, you can also run "wsl.sh cmd" in WSL. - -### Changing the default install location - -To change this, you will need to edit the scripts as follows: -wsl.sh: line ~22 INSTALL_DIR="/path/to/install/dir" - -Keep in mind that there is a long-standing bug in WSL that significantly -slows drive read/write speeds when using a physical drive as opposed to -the virtual one that Linux is installed in. - -## WSL installation - -Guide created by [@jfryton](https://github.com/jfryton). Thank you jfryton. - ------ - -Here's an easy-to-follow, step-by-step guide for installing Windows Subsystem for Linux (WSL) with Ubuntu on Windows 10/11: - -### Step 1: Enable WSL - -1. Press the Windows key + X and click on "Windows PowerShell (Admin)" or "Windows Terminal (Admin)" to open PowerShell or Terminal with administrator privileges. -2. In the PowerShell window, type the following command and press Enter: - -``` -wsl --install -``` - -If this command doesn't work, you can enable WSL with the following command for Windows 10: - -``` -wsl --set-default-version 1 -``` - -For Windows 11, you can use: - -``` -wsl --set-default-version 2 -``` - -You may be prompted to restart your computer. If so, save your work and restart. - -### Step 2: Install Ubuntu - -1. Open the Microsoft Store. -2. Search for "Ubuntu" in the search bar. -3. Choose the desired Ubuntu version (e.g., Ubuntu 20.04 LTS) and click "Get" or "Install" to download and install the Ubuntu app. -4. Once the installation is complete, click "Launch" or search for "Ubuntu" in the Start menu and open the app. - -### Step 3: Set up Ubuntu - -1. When you first launch the Ubuntu app, it will take a few minutes to set up. Be patient as it installs the necessary files and sets up your environment. -2. Once the setup is complete, you will be prompted to create a new UNIX username and password. Choose a username and password, and make sure to remember them, as you will need them for future administrative tasks within the Ubuntu environment. - -### Step 4: Update and upgrade packages - -1. After setting up your username and password, it's a good idea to update and upgrade your Ubuntu system. Run the following commands in the Ubuntu terminal: - -``` -sudo apt update -sudo apt upgrade -``` - -2. Enter your password when prompted. This will update the package list and upgrade any outdated packages. - -Congratulations! You have now installed WSL with Ubuntu on your Windows 10/11 system. You can use the Ubuntu terminal for various tasks, like running Linux commands, installing packages, or managing files. - -You can launch your WSL Ubuntu installation by selecting the Ubuntu app (like any other program installed on your computer) or typing 'ubuntu' into Powershell or Terminal. - -### Step 5: Proceed with Linux instructions - -1. 
You can now follow the Linux setup instructions. If you receive any error messages about a missing tool or package, just install them using apt: - -``` -sudo apt install [missing package] -``` - -You will probably need to install build-essential - -``` -sudo apt install build-essential -``` - -If you face any issues or need to troubleshoot, you can always refer to the official Microsoft documentation for WSL: https://docs.microsoft.com/en-us/windows/wsl/ - -### WSL2 performance using /mnt: - -When you git clone a repository, put it inside WSL and not outside. To understand more, take a look at this [issue](https://github.com/microsoft/WSL/issues/4197#issuecomment-604592340) - -### Bonus: Port Forwarding - -By default, you won't be able to access the webui from another device on your local network. You will need to setup the appropriate port forwarding using the following steps: - -1. First, get the IP address of the WSL by typing `wsl hostname -I`. This will output the IP address, for example `172.20.134.111`. -2. Then, use the following command (using PowerShell or Terminal with administrator privileges) to set up port forwarding, replacing `172.20.134.111` with the IP address you obtained in step 1: - -``` -netsh interface portproxy add v4tov4 listenaddress=0.0.0.0 listenport=7860 connectaddress=172.20.134.111 connectport=7860 -``` - diff --git a/start_wsl.bat b/start_wsl.bat deleted file mode 100755 index d7bacead..00000000 --- a/start_wsl.bat +++ /dev/null @@ -1,11 +0,0 @@ -@echo off - -cd /D "%~dp0" - -set PATH=%PATH%;%SystemRoot%\system32 - -@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script -call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh %*" - -:end -pause diff --git a/update_wizard_wsl.bat b/update_wizard_wsl.bat deleted file mode 100755 index 35f0a349..00000000 --- a/update_wizard_wsl.bat +++ /dev/null @@ -1,11 +0,0 @@ -@echo off - -cd /D "%~dp0" - -set PATH=%PATH%;%SystemRoot%\system32 - -@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script calling wsl.sh with 'update' will run updater -call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh update-wizard" - -:end -pause diff --git a/wsl.sh b/wsl.sh deleted file mode 100755 index c5d28b16..00000000 --- a/wsl.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash - -# detect if build-essential is missing or broken -if ! dpkg-query -W -f'${Status}' "build-essential" 2>/dev/null | grep -q "ok installed"; then -echo "build-essential not found or broken! - -A C++ compiler is required to build needed Python packages! -To install one, run cmd_wsl.bat and enter these commands: - -sudo apt-get update -sudo apt-get install build-essential -" -read -n1 -p "Continue the installer anyway? [y,n]" EXIT_PROMPT -# only continue if user inputs 'y' else exit -if ! [[ $EXIT_PROMPT == "Y" || $EXIT_PROMPT == "y" ]]; then exit; fi -fi - -# deactivate existing conda envs as needed to avoid conflicts -{ conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null - -# config unlike other scripts, can't use current directory due to file IO bug in WSL, needs to be in virtual drive -INSTALL_DIR_PREFIX="$HOME/text-gen-install" -if [[ ! 
$(realpath "$(pwd)/..") = /mnt/* ]]; then - INSTALL_DIR_PREFIX="$(realpath "$(pwd)/..")" && INSTALL_INPLACE=1 -fi -INSTALL_DIR="$INSTALL_DIR_PREFIX/text-generation-webui" -CONDA_ROOT_PREFIX="$INSTALL_DIR/installer_files/conda" -INSTALL_ENV_DIR="$INSTALL_DIR/installer_files/env" -MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py311_24.11.1-0-Linux-x86_64.sh" -conda_exists="F" - -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME -export CUDA_PATH="$INSTALL_ENV_DIR" -export CUDA_HOME="$CUDA_PATH" - -# /usr/lib/wsl/lib needs to be added to LD_LIBRARY_PATH to fix years-old bug in WSL where GPU drivers aren't linked properly -export LD_LIBRARY_PATH="$CUDA_HOME/lib:/usr/lib/wsl/lib:$LD_LIBRARY_PATH" - -# open bash cli if called with 'wsl.sh cmd' with workarounds for existing conda -if [ "$1" == "cmd" ]; then - exec bash --init-file <(echo ". ~/.bashrc; conda deactivate 2> /dev/null; cd $INSTALL_DIR || cd $HOME; source $CONDA_ROOT_PREFIX/etc/profile.d/conda.sh; conda activate $INSTALL_ENV_DIR") - exit -fi - -if [[ "$INSTALL_DIR" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi - -# create install dir if missing -if [ ! -d "$INSTALL_DIR" ]; then mkdir -p "$INSTALL_DIR" || exit; fi - -# figure out whether git and conda needs to be installed -if "$CONDA_ROOT_PREFIX/bin/conda" --version &>/dev/null; then conda_exists="T"; fi - -# (if necessary) install git and conda into a contained environment -# download miniconda -if [ "$conda_exists" == "F" ]; then - echo "Downloading Miniconda from $MINICONDA_DOWNLOAD_URL to $INSTALL_DIR/miniconda_installer.sh" - - curl -L "$MINICONDA_DOWNLOAD_URL" > "$INSTALL_DIR/miniconda_installer.sh" - - chmod u+x "$INSTALL_DIR/miniconda_installer.sh" - bash "$INSTALL_DIR/miniconda_installer.sh" -b -p $CONDA_ROOT_PREFIX - - # test the conda binary - echo "Miniconda version:" - "$CONDA_ROOT_PREFIX/bin/conda" --version - - # delete the Miniconda installer - rm "$INSTALL_DIR/miniconda_installer.sh" -fi - -# create the installer env -if [ ! -e "$INSTALL_ENV_DIR" ]; then - "$CONDA_ROOT_PREFIX/bin/conda" create -y -k --prefix "$INSTALL_ENV_DIR" python=3.11 git -fi - -# check if conda environment was actually created -if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then - echo "Conda environment is empty." - exit -fi - -# activate installer env -source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script) -conda activate "$INSTALL_ENV_DIR" - -pushd $INSTALL_DIR 1> /dev/null || exit - -if [ ! -f "./server.py" ]; then - git init -b main - git remote add origin https://github.com/oobabooga/text-generation-webui - git fetch - git remote set-head origin -a - git reset origin/HEAD --hard - git branch --set-upstream-to=origin/HEAD - git restore -- . :!./CMD_FLAGS.txt -fi - -# copy CMD_FLAGS.txt to install dir to allow edits within Windows -if [[ $INSTALL_INPLACE != 1 ]]; then - # workaround for old install migration - if [ ! 
-f "./wsl.sh" ]; then - git pull || exit - [ -f "../webui.py" ] && mv "../webui.py" "../webui-old.py" - fi - if [ -f "$(dirs +1)/CMD_FLAGS.txt" ] && [ -f "./CMD_FLAGS.txt" ]; then cp -u "$(dirs +1)/CMD_FLAGS.txt" "$INSTALL_DIR"; fi -fi - -# setup installer env update env if called with 'wsl.sh update' -case "$1" in -("update-wizard") python one_click.py --update-wizard;; -(*) python one_click.py $@;; -esac From 19c8dced672d6f10ee6214abd364716a18e68d0a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 05:03:23 -0700 Subject: [PATCH 27/49] Move settings-template.yaml into user_data --- settings-template.yaml => user_data/settings-template.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename settings-template.yaml => user_data/settings-template.yaml (100%) diff --git a/settings-template.yaml b/user_data/settings-template.yaml similarity index 100% rename from settings-template.yaml rename to user_data/settings-template.yaml From cbd4d967cc6216fc0d465ce2d46be43fe7155cd7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 05:09:52 -0700 Subject: [PATCH 28/49] Update a --help message --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index 21e6dd00..6bcbdd46 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -81,7 +81,7 @@ group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to loa group.add_argument('--model-dir', type=str, default='user_data/models', help='Path to directory with all the models.') group.add_argument('--lora-dir', type=str, default='user_data/loras', help='Path to directory with all the loras.') group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.') -group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') +group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See user_data/settings-template.yaml for an example. If you create a file called user_data/settings.yaml, this file will be loaded by default without the need to use the --settings flag.') group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') group.add_argument('--idle-timeout', type=int, default=0, help='Unload model after this many minutes of inactivity. 
It will be automatically reloaded when you try to use it again.') From 6acb0e1bee1d31dd848056a3a5d0d5ebacc1d1a3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 05:13:08 -0700 Subject: [PATCH 29/49] Change a UI description --- modules/ui_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_session.py b/modules/ui_session.py index 42434e51..a3f09821 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -13,7 +13,7 @@ def create_ui(): shared.gradio['reset_interface'] = gr.Button("Apply flags/extensions and restart", interactive=not mu) with gr.Row(): shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡') - shared.gradio['save_settings'] = gr.Button('Save UI defaults to settings.yaml', interactive=not mu) + shared.gradio['save_settings'] = gr.Button('Save UI defaults to user_data/settings.yaml', interactive=not mu) with gr.Row(): with gr.Column(): From 3a207e7a57387bd6bc13ed529acb4b72f0de8467 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 07:31:04 -0700 Subject: [PATCH 30/49] Improve the --help formatting a bit --- README.md | 181 +++++++++++++++++++++++----------------------- modules/shared.py | 4 +- 2 files changed, 92 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 58f77786..a950af53 100644 --- a/README.md +++ b/README.md @@ -186,9 +186,9 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [-- [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] - [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size CTX_SIZE] + [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] - [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] + [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api] @@ -197,125 +197,124 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [-- Text 
generation web UI options: - -h, --help show this help message and exit + -h, --help show this help message and exit Basic settings: - --multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly. - --character CHARACTER The name of the character to load in chat mode by default. - --model MODEL Name of the model to load by default. - --lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. - --model-dir MODEL_DIR Path to directory with all the models. - --lora-dir LORA_DIR Path to directory with all the loras. - --model-menu Show a model menu in the terminal when the web UI is first launched. - --settings SETTINGS Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, - this file will be loaded by default without the need to use the --settings flag. - --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. - --verbose Print the prompts to the terminal. - --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. + --multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly. + --character CHARACTER The name of the character to load in chat mode by default. + --model MODEL Name of the model to load by default. + --lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. + --model-dir MODEL_DIR Path to directory with all the models. + --lora-dir LORA_DIR Path to directory with all the loras. + --model-menu Show a model menu in the terminal when the web UI is first launched. + --settings SETTINGS Load the default interface settings from this yaml file. See user_data/settings-template.yaml for an example. If you create a file called + user_data/settings.yaml, this file will be loaded by default without the need to use the --settings flag. + --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. + --verbose Print the prompts to the terminal. + --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, - ExLlamav2, HQQ, TensorRT-LLM. + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, + TensorRT-LLM. Transformers/Accelerate: - --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. - --cpu-memory CPU_MEMORY Maximum CPU memory in GiB. Use this for CPU offloading. - --disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. - --disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "user_data/cache". - --load-in-8bit Load the model with 8-bit precision (using bitsandbytes). - --bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. 
- --no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. - --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. - --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. - --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. - --use_flash_attention_2 Set use_flash_attention_2=True while loading the model. - --use_eager_attention Set attn_implementation= eager while loading the model. - --torch-compile Compile the model with torch.compile for improved performance. + --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. + --cpu-memory CPU_MEMORY Maximum CPU memory in GiB. Use this for CPU offloading. + --disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. + --disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "user_data/cache". + --load-in-8bit Load the model with 8-bit precision (using bitsandbytes). + --bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. + --no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. + --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. + --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. + --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. + --use_flash_attention_2 Set use_flash_attention_2=True while loading the model. + --use_eager_attention Set attn_implementation= eager while loading the model. + --torch-compile Compile the model with torch.compile for improved performance. bitsandbytes 4-bit: - --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). - --use_double_quant use_double_quant for 4-bit. - --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. - --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. + --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). + --use_double_quant use_double_quant for 4-bit. + --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. + --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. llama.cpp: - --flash-attn Use flash-attention. - --threads THREADS Number of threads to use. - --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. - --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. - --no-mmap Prevent mmap from being used. - --mlock Force the system to keep the model in RAM. - --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. - --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. - --numa Activate NUMA task allocation for llama.cpp. - --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. - --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. - --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". 
Example: "override-tensor=exps=CPU" - --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. + --flash-attn Use flash-attention. + --threads THREADS Number of threads to use. + --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. + --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. + --no-mmap Prevent mmap from being used. + --mlock Force the system to keep the model in RAM. + --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. + --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. + --numa Activate NUMA task allocation for llama.cpp. + --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. + --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU" + --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. Context and cache management: - --ctx-size CTX_SIZE, --n_ctx CTX_SIZE, --max_seq_len CTX_SIZE - Context size in tokens. + --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. Speculative decoding: - --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. - --draft-max DRAFT_MAX Number of tokens to draft for speculative decoding. - --gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model. - --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 - --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. + --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. + --draft-max DRAFT_MAX Number of tokens to draft for speculative decoding. + --gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model. + --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 + --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. ExLlamaV2: - --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. - --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. - --cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. - --no_flash_attn Force flash-attention to not be used. - --no_xformers Force xformers to not be used. - --no_sdpa Force Torch SDPA to not be used. - --num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral. - --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. + --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. + --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. + --cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. + --no_flash_attn Force flash-attention to not be used. + --no_xformers Force xformers to not be used. 
+ --no_sdpa Force Torch SDPA to not be used. + --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. + --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. HQQ: - --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. + --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. TensorRT-LLM: - --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. + --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. Cache: - --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. + --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. DeepSpeed: - --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. - --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. - --local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups. + --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. + --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. + --local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups. RoPE: - --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. - --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). - --compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. + --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. + --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). + --compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. Gradio: - --listen Make the web UI reachable from your local network. - --listen-port LISTEN_PORT The listening port that the server will use. - --listen-host LISTEN_HOST The hostname that the server will use. - --share Create a public URL. This is useful for running the web UI on Google Colab or similar. - --auto-launch Open the web UI in the default browser upon launch. - --gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". - --gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. - --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. - --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. - --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy - --old-colors Use the legacy Gradio colors, before the December/2024 update. + --listen Make the web UI reachable from your local network. 
+ --listen-port LISTEN_PORT The listening port that the server will use. + --listen-host LISTEN_HOST The hostname that the server will use. + --share Create a public URL. This is useful for running the web UI on Google Colab or similar. + --auto-launch Open the web UI in the default browser upon launch. + --gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". + --gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. + --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. + --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. + --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy + --old-colors Use the legacy Gradio colors, before the December/2024 update. API: - --api Enable the API extension. - --public-api Create a public URL for the API using Cloudfare. - --public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. - --api-port API_PORT The listening port for the API. - --api-key API_KEY API authentication key. - --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. - --api-enable-ipv6 Enable IPv6 for the API - --api-disable-ipv4 Disable IPv4 for the API - --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. + --api Enable the API extension. + --public-api Create a public URL for the API using Cloudfare. + --public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. + --api-port API_PORT The listening port for the API. + --api-key API_KEY API authentication key. + --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. + --api-enable-ipv6 Enable IPv6 for the API + --api-disable-ipv4 Disable IPv4 for the API + --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. ``` diff --git a/modules/shared.py b/modules/shared.py index 6bcbdd46..63bdb536 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -131,7 +131,7 @@ group.add_argument('--streaming-llm', action='store_true', help='Activate Stream # Cache group = parser.add_argument_group('Context and cache management') -group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, help='Context size in tokens.') +group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') # Speculative decoding @@ -150,7 +150,7 @@ group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Creat group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.') group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.') -group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. 
Applies to MoE models like Mixtral.') +group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') # HQQ From d1e7d9c5d5e083f7fd3c4ce4915a9d995bdc182a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 09:00:56 -0700 Subject: [PATCH 31/49] Update CMD_FLAGS.txt --- user_data/CMD_FLAGS.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/user_data/CMD_FLAGS.txt b/user_data/CMD_FLAGS.txt index c2d63d9e..b0f667b0 100644 --- a/user_data/CMD_FLAGS.txt +++ b/user_data/CMD_FLAGS.txt @@ -1,3 +1,3 @@ -# Only used by the one-click installer. +# Add persistent flags here to use every time you launch the web UI. # Example: # --listen --api From a317450dfa62df456c5bae2463954a3e77dd445b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 14:59:29 -0700 Subject: [PATCH 32/49] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a950af53..2f92ed06 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again. -You can use command-line flags, like `./start_linux.sh --help`, or add them to `CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`. +You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
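As a quick illustration of the flag handling described in the hunk above: after these patches, persistent launch options live in `user_data/CMD_FLAGS.txt` and are applied on every start, the same as passing them to the start script. A minimal sketch of that file, using only the flags already quoted as examples in the diffs above (`--listen`, `--api`), might look like:

```
# user_data/CMD_FLAGS.txt
# Every flag listed here is appended each time the web UI is launched,
# exactly as if it had been passed on the command line.
--listen
--api
```
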
@@ -157,7 +157,7 @@ mkdir -p logs cache # TORCH_CUDA_ARCH_LIST based on your GPU model # APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal) # BUILD_EXTENIONS optionally add comma separated list of extensions to build -# Edit CMD_FLAGS.txt and add in it the options you want to execute (like --listen --cpu) +# Edit user_data/CMD_FLAGS.txt and add in it the options you want to execute (like --listen --cpu) # docker compose up --build ``` From bc55feaf3e528951296e9eaacb66d63e7b14a171 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 15:07:35 -0700 Subject: [PATCH 33/49] Improve host header validation in local mode --- extensions/openai/script.py | 14 ++++++++++++++ modules/gradio_hijack.py | 25 ++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index f907cdbb..0a887de2 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -86,6 +86,20 @@ app.add_middleware( ) +@app.middleware("http") +async def validate_host_header(request: Request, call_next): + # Be strict about only approving access to localhost by default + if not (shared.args.listen or shared.args.public_api): + host = request.headers.get("host", "").split(":")[0] + if host not in ["localhost", "127.0.0.1"]: + return JSONResponse( + status_code=400, + content={"detail": "Invalid host header"} + ) + + return await call_next(request) + + @app.options("/", dependencies=check_key) async def options_route(): return JSONResponse(content="OK") diff --git a/modules/gradio_hijack.py b/modules/gradio_hijack.py index 2ddd983a..8e3bb0d9 100644 --- a/modules/gradio_hijack.py +++ b/modules/gradio_hijack.py @@ -1,5 +1,6 @@ ''' -Copied from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14184 +Most of the code here was adapted from: +https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14184 ''' import inspect @@ -7,6 +8,28 @@ import warnings from functools import wraps import gradio as gr +import gradio.routes +from starlette.middleware.trustedhost import TrustedHostMiddleware + +from modules import shared + +orig_create_app = gradio.routes.App.create_app + + +# Be strict about only approving access to localhost by default +def create_app_with_trustedhost(*args, **kwargs): + app = orig_create_app(*args, **kwargs) + + if not (shared.args.listen or shared.args.share): + app.add_middleware( + TrustedHostMiddleware, + allowed_hosts=["localhost", "127.0.0.1"] + ) + + return app + + +gradio.routes.App.create_app = create_app_with_trustedhost class GradioDeprecationWarning(DeprecationWarning): From 35717a088c0487b774fdf6550aaca68476ffe135 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 15:42:27 -0700 Subject: [PATCH 34/49] API: Add an /v1/internal/health endpoint --- extensions/openai/script.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 0a887de2..c2dc337b 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -250,6 +250,11 @@ async def handle_moderations(request: Request): return JSONResponse(response) +@app.get("/v1/internal/health", dependencies=check_key) +async def handle_health_check(): + return JSONResponse(content={"status": "ok"}) + + @app.post("/v1/internal/encode", response_model=EncodeResponse, dependencies=check_key) async def handle_token_encode(request_data: 
EncodeRequest): response = token_encode(request_data.text) From 029aab64048a6297b1710933db27beee072bc2fa Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 16:38:13 -0700 Subject: [PATCH 35/49] Revert "Add `-noavx2` portable builds" This reverts commit 0dd71e78c965289c5bfccb3cbc36183f04be23c6. --- .github/workflows/build-portable-release-cuda.yml | 8 ++------ .github/workflows/build-portable-release-vulkan.yml | 8 ++------ .github/workflows/build-portable-release.yml | 12 ++++-------- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml index aacd59f9..fb9e61b0 100644 --- a/.github/workflows/build-portable-release-cuda.yml +++ b/.github/workflows/build-portable-release-cuda.yml @@ -59,7 +59,7 @@ jobs: $matrix = @{ 'os' = @('ubuntu-22.04', 'windows-2019') 'pyver' = @("3.11") - 'avx' = @("AVX2", "AVX") + 'avx' = @("AVX2") 'cuda' = @("11.7", "12.4") } @@ -161,11 +161,7 @@ jobs: # 6. Create ZIP file cd .. VERSION_CLEAN="${VERSION#v}" - if [[ "$AVX_SUPPORT" == "AVX2" ]]; then - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip" - else - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}-noavx2.zip" - fi + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip" echo "Creating archive: $ZIP_NAME" if [[ "$RUNNER_OS" == "Windows" ]]; then diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml index 6f1e5ec8..3724e384 100644 --- a/.github/workflows/build-portable-release-vulkan.yml +++ b/.github/workflows/build-portable-release-vulkan.yml @@ -59,7 +59,7 @@ jobs: $matrix = @{ 'os' = @('ubuntu-22.04', 'windows-2019') 'pyver' = @("3.11") - 'avx' = @("AVX2", "AVX") + 'avx' = @("AVX2") } if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} @@ -146,11 +146,7 @@ jobs: # 6. Create ZIP file cd .. VERSION_CLEAN="${VERSION#v}" - if [[ "$AVX_SUPPORT" == "AVX2" ]]; then - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan.zip" - else - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan-noavx2.zip" - fi + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-vulkan.zip" echo "Creating archive: $ZIP_NAME" if [[ "$RUNNER_OS" == "Windows" ]]; then diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml index af886652..bdf96cec 100644 --- a/.github/workflows/build-portable-release.yml +++ b/.github/workflows/build-portable-release.yml @@ -15,7 +15,7 @@ on: type: string exclude: description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' - default: 'os:macos-13,avx:AVX;os:macos-14,avx:AVX' + default: 'None' required: false type: string workflow_call: @@ -32,7 +32,7 @@ on: type: string exclude: description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' - default: 'os:macos-13,avx:AVX;os:macos-14,avx:AVX' + default: 'None' required: false type: string @@ -59,7 +59,7 @@ jobs: $matrix = @{ 'os' = @('ubuntu-22.04', 'windows-2019', 'macos-13', 'macos-14') 'pyver' = @("3.11") - 'avx' = @("AVX2", "AVX") + 'avx' = @("AVX2") } if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} @@ -171,11 +171,7 @@ jobs: # 5. 
Create ZIP file cd .. VERSION_CLEAN="${VERSION#v}" - if [[ "$AVX_SUPPORT" == "AVX2" ]]; then - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip" - else - ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-noavx2.zip" - fi + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip" echo "Creating archive: $ZIP_NAME" if [[ "$RUNNER_OS" == "Windows" ]]; then From bf2aa19b2155b4fc611475eafc0e1abaa213281e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 16:39:22 -0700 Subject: [PATCH 36/49] Bump llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 2 +- requirements/portable/requirements_amd_noavx2.txt | 2 +- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index e30e5de7..c20c161e 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 98df8e8a..437da5b5 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,6 @@ 
sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 9307d733..b1c87990 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 414e1885..e62987b0 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git 
a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 5627b38e..f7a9f114 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 07845af2..b8cd8390 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index ffa2b6eb..3b52d59b 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 646aaaaf..a04e8979 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index fcd221e1..e128b285 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == 
"Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 3564b3bf..fd79677c 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt index 98afdc67..431b50a1 100644 --- a/requirements/portable/requirements_amd_noavx2.txt +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 433c1f17..f68de3eb 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 9d9c6852..fa1e9014 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index dd25a9d5..22b3adf4 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index dd7f740c..e266794a 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system 
== "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 7542e897..14f6fbbb 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 4810bd50..0a619193 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index d1787e68..997bc1d9 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.5.0/llama_cpp_binaries-0.5.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From 4ff91b6588deb358a802fb28caee189fa442785f Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 17:24:40 -0700 Subject: [PATCH 37/49] Better default settings for Speculative Decoding --- modules/shared.py | 3 +-- modules/ui_model_menu.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 63bdb536..a2ff61e2 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -14,7 +14,6 @@ from modules.logging_colors import logger model = None tokenizer = None model_name = 'None' -draft_model_name = 'None' is_seq2seq = False model_dirty_from_training = False lora_names = [] @@ -138,7 +137,7 @@ group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type group = parser.add_argument_group('Speculative decoding') group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.') group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.') -group.add_argument('--gpu-layers-draft', type=int, default=0, help='Number of layers to offload to the GPU for the draft model.') +group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.') group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index dc09c899..546200f9 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -96,7 +96,7 @@ def create_ui(): # Speculative decoding with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']: with gr.Row(): - shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.draft_model_name, elem_classes='slim-dropdown', interactive=not mu) + shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', interactive=not mu) ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') From 234aba1c50374dee8d342d4df12e4de9428a544f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 17:33:47 -0700 Subject: [PATCH 38/49] llama.cpp: Simplify the prompt processing progress indicator The progress bar was unreliable --- modules/llama_cpp_server.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index fb972a32..895f3b1e 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -11,7 +11,6 @@ from pathlib import Path import llama_cpp_binaries import requests -from tqdm import tqdm from modules import shared from modules.logging_colors import logger @@ -391,31 +390,15 @@ class LlamaServer: def filter_stderr_with_progress(process_stderr): - progress_bar = None progress_pattern = re.compile(r'slot 
update_slots: id.*progress = (\d+\.\d+)') - try: for line in iter(process_stderr.readline, ''): progress_match = progress_pattern.search(line) - if progress_match: - progress = float(progress_match.group(1)) - - # Create progress bar on first progress message - if progress_bar is None: - progress_bar = tqdm(total=1.0, desc="Prompt Processing", leave=False) - - progress_bar.update(progress - progress_bar.n) - - # Clean up when complete - if progress >= 1.0: - progress_bar.close() - progress_bar = None - - if not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line: + sys.stderr.write(line) + sys.stderr.flush() + elif not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line: sys.stderr.write(line) sys.stderr.flush() except (ValueError, IOError): - if progress_bar: - progress_bar.close() pass From c4afc0421d9141610e059007f41592b2223d9aae Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 17:43:53 -0700 Subject: [PATCH 39/49] Fix parsing of --n_ctx and --max_seq_len --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index a2ff61e2..2a08db2b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -130,7 +130,7 @@ group.add_argument('--streaming-llm', action='store_true', help='Activate Stream # Cache group = parser.add_argument_group('Context and cache management') -group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') +group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', dest='ctx_size', help='Context size in tokens.') group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') # Speculative decoding From 0fe3b033d01efd2dfb62c1057a101192c779956a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 17:52:21 -0700 Subject: [PATCH 40/49] Fix parsing of --n_ctx and --max_seq_len (2nd attempt) --- modules/shared.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 2a08db2b..7a3368f5 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -130,7 +130,7 @@ group.add_argument('--streaming-llm', action='store_true', help='Activate Stream # Cache group = parser.add_argument_group('Context and cache management') -group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', dest='ctx_size', help='Context size in tokens.') +group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. 
q4_q8).') # Speculative decoding @@ -222,10 +222,19 @@ if cmd_flags_path.exists(): args = parser.parse_args() args_defaults = parser.parse_args([]) + +# Create a mapping of all argument aliases to their canonical names +alias_to_dest = {} +for action in parser._actions: + for opt in action.option_strings: + alias_to_dest[opt.lstrip('-').replace('-', '_')] = action.dest + provided_arguments = [] for arg in sys.argv[1:]: arg = arg.lstrip('-').replace('-', '_') - if hasattr(args, arg): + if arg in alias_to_dest: + provided_arguments.append(alias_to_dest[arg]) + elif hasattr(args, arg): provided_arguments.append(arg) From 4a32e1f80caa26445aded2f7856569c03d31be98 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 18:01:33 -0700 Subject: [PATCH 41/49] UI: show draft_max for ExLlamaV2 --- modules/loaders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/loaders.py b/modules/loaders.py index 062e4837..b8ae82d7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -86,6 +86,7 @@ loaders_and_params = OrderedDict({ 'no_sdpa', 'exllamav2_info', 'model_draft', + 'draft_max', 'ctx_size_draft', 'speculative_decoding_accordion', ], From 8b83e6f843767d6423a9017e37c4fdbd6c7cb3ac Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 18:14:57 -0700 Subject: [PATCH 42/49] Prevent Gradio from saying 'Thank you for being a Gradio user!' --- modules/gradio_hijack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/gradio_hijack.py b/modules/gradio_hijack.py index 8e3bb0d9..817da40c 100644 --- a/modules/gradio_hijack.py +++ b/modules/gradio_hijack.py @@ -9,6 +9,7 @@ from functools import wraps import gradio as gr import gradio.routes +import gradio.utils from starlette.middleware.trustedhost import TrustedHostMiddleware from modules import shared @@ -30,6 +31,7 @@ def create_app_with_trustedhost(*args, **kwargs): gradio.routes.App.create_app = create_app_with_trustedhost +gradio.utils.launch_counter = lambda: None class GradioDeprecationWarning(DeprecationWarning): From 511eb6aa94965fcfbc3d39ae1c62c7069ba3e4f2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 18:20:00 -0700 Subject: [PATCH 43/49] Fix saving settings to settings.yaml --- modules/ui_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_session.py b/modules/ui_session.py index a3f09821..7cf9f6e6 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -48,7 +48,7 @@ def handle_save_settings(state, preset, extensions, show_controls, theme): return [ contents, "settings.yaml", - "./user_data", + "user_data/", gr.update(visible=True) ] From 943451284fa7ba28f9768a9efb06e0e5736c8714 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 18:25:06 -0700 Subject: [PATCH 44/49] Fix the Notebook tab not loading its default prompt --- modules/ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui.py b/modules/ui.py index ef5ed0e6..f137e62d 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -266,7 +266,7 @@ def apply_interface_values(state, use_persistent=False): if 'textbox-default' in state and 'prompt_menu-default' in state: state.pop('prompt_menu-default') - if 'textbox-notebook' and 'prompt_menu-notebook' in state: + if 'textbox-notebook' in state and 'prompt_menu-notebook' in state: state.pop('prompt_menu-notebook') elements = 
list_interface_input_elements() From 7b80acd524a1a37c007a86bc9404a39a9c188252 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 18:40:03 -0700 Subject: [PATCH 45/49] Fix parsing --extra-flags --- modules/llama_cpp_server.py | 2 +- modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 895f3b1e..9572d5aa 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -312,7 +312,7 @@ class LlamaServer: elif extra_flags.startswith("'") and extra_flags.endswith("'"): extra_flags = extra_flags[1:-1].strip() - for flag_item in extra_flags.split(';'): + for flag_item in extra_flags.split(','): if '=' in flag_item: flag, value = flag_item.split('=', 1) cmd += [f"--{flag}", value] diff --git a/modules/shared.py b/modules/shared.py index 7a3368f5..5d9dd362 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -125,7 +125,7 @@ group.add_argument('--tensor-split', type=str, default=None, help='Split the mod group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') -group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"') +group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"') group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') # Cache diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 546200f9..e3cf2ba6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -55,7 +55,7 @@ def create_ui(): shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"') + shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. 
Use this for CPU offloading.", value=shared.args.cpu_memory) shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') From fa861de05b478260dfc129d0a8eacba0a274b5ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 18:52:44 -0700 Subject: [PATCH 46/49] Fix portable builds with Python 3.12 --- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 2 +- requirements/portable/requirements_amd_noavx2.txt | 2 +- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index e128b285..5c717343 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index fd79677c..b616193d 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt index 431b50a1..de4740c9 100644 --- a/requirements/portable/requirements_amd_noavx2.txt +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index f68de3eb..6310327d 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index fa1e9014..f69b58e7 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt 
b/requirements/portable/requirements_cpu_only.txt index 22b3adf4..dafa6bbe 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index e266794a..c02191eb 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 14f6fbbb..456188b4 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 0a619193..7e733967 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 997bc1d9..0329a598 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 363b632a0db8315ddc4223d4989df7757d80d66b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 19:22:36 -0700 Subject: [PATCH 47/49] Lint --- css/main.css | 70 ++++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/css/main.css b/css/main.css index 9dce4d0e..6e329b9a 100644 --- a/css/main.css +++ b/css/main.css @@ -1311,74 +1311,74 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { /* Thinking blocks styling */ .thinking-block { - margin-bottom: 12px; - border-radius: 8px; - border: 1px solid rgba(0, 0, 0, 0.1); - background-color: var(--light-theme-gray); - overflow: hidden; + margin-bottom: 12px; + border-radius: 8px; + border: 1px solid rgb(0 0 0 / 10%); + background-color: var(--light-theme-gray); + overflow: hidden; } .dark .thinking-block { - background-color: var(--darker-gray); + background-color: var(--darker-gray); } .thinking-header { - display: flex; - align-items: center; - padding: 10px 16px; - cursor: pointer; - user-select: none; - font-size: 14px; - color: rgba(0, 0, 0, 0.7); - transition: background-color 0.2s; + display: flex; + align-items: center; + padding: 10px 16px; + cursor: pointer; + user-select: none; + font-size: 14px; + color: rgb(0 0 0 / 70%); + transition: background-color 0.2s; } .thinking-header:hover { - background-color: rgba(0, 0, 0, 0.03); + background-color: rgb(0 0 0 / 3%); } .thinking-header::-webkit-details-marker { - display: none; + display: none; } .thinking-icon { - margin-right: 8px; - color: rgba(0, 0, 0, 0.5); + margin-right: 8px; + color: rgb(0 0 0 / 50%); } .thinking-title { - 
font-weight: 500; + font-weight: 500; } .thinking-content { - padding: 12px 16px; - border-top: 1px solid rgba(0, 0, 0, 0.07); - color: rgba(0, 0, 0, 0.7); - font-size: 14px; - line-height: 1.5; - overflow-wrap: break-word; - max-height: 300px; - overflow-y: scroll; - contain: layout; + padding: 12px 16px; + border-top: 1px solid rgb(0 0 0 / 7%); + color: rgb(0 0 0 / 70%); + font-size: 14px; + line-height: 1.5; + overflow-wrap: break-word; + max-height: 300px; + overflow-y: scroll; + contain: layout; } /* Animation for opening thinking blocks */ @keyframes fadeIn { - from { opacity: 0; } - to { opacity: 1; } + from { opacity: 0; } + to { opacity: 1; } } .thinking-block[open] .thinking-content { - animation: fadeIn 0.3s ease-out; + animation: fadeIn 0.3s ease-out; } /* Additional style for in-progress thinking */ .thinking-block[data-streaming="true"] .thinking-title { - animation: pulse 1.5s infinite; + animation: pulse 1.5s infinite; } @keyframes pulse { - 0% { opacity: 0.6; } - 50% { opacity: 1; } - 100% { opacity: 0.6; } + 0% { opacity: 0.6; } + 50% { opacity: 1; } + 100% { opacity: 0.6; } } From 70952553c7f3fd9fdab8e752260d18a2fff6d00e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 19:29:08 -0700 Subject: [PATCH 48/49] Lint --- modules/exllamav2.py | 2 +- modules/exllamav2_hf.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 952b73b8..6bb422ea 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -3,7 +3,6 @@ import traceback from pathlib import Path import torch - from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -16,6 +15,7 @@ from exllamav2 import ( ExLlamaV2Tokenizer ) from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator + from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index d6c3bf6e..eb801940 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -4,15 +4,6 @@ from pathlib import Path from typing import Any, Dict, Optional, Union import torch -from torch.nn import CrossEntropyLoss -from transformers import ( - GenerationConfig, - GenerationMixin, - PretrainedConfig, - PreTrainedModel -) -from transformers.modeling_outputs import CausalLMOutputWithPast - from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -23,6 +14,15 @@ from exllamav2 import ( ExLlamaV2Cache_TP, ExLlamaV2Config ) +from torch.nn import CrossEntropyLoss +from transformers import ( + GenerationConfig, + GenerationMixin, + PretrainedConfig, + PreTrainedModel +) +from transformers.modeling_outputs import CausalLMOutputWithPast + from modules import shared from modules.logging_colors import logger From 765fea5e36ee8089387c87478b8cedccd9c8b23e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 26 Apr 2025 19:33:46 -0700 Subject: [PATCH 49/49] UI: minor style change --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 6e329b9a..d6e5ac83 100644 --- a/css/main.css +++ b/css/main.css @@ -1357,7 +1357,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { font-size: 14px; line-height: 1.5; overflow-wrap: break-word; - max-height: 300px; + max-height: 250px; overflow-y: scroll; contain: layout; }
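
A note on the --n_ctx/--max_seq_len fix (PATCH 40) above: the second attempt works by mapping every argparse option string to its canonical dest before deciding which arguments the user explicitly provided. The sketch below is a minimal standalone illustration of that pattern, not part of the patch series; the flag is copied from the patch, and the surrounding script is an assumption for demonstration only.

import argparse
import sys

parser = argparse.ArgumentParser()
# One canonical flag with two aliases, mirroring --ctx-size / --n_ctx / --max_seq_len
parser.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192)
args = parser.parse_args()

# Map every spelling ('ctx_size', 'n_ctx', 'max_seq_len') to the canonical dest ('ctx_size')
alias_to_dest = {}
for action in parser._actions:
    for opt in action.option_strings:
        alias_to_dest[opt.lstrip('-').replace('-', '_')] = action.dest

# Record which destinations were explicitly set, whichever alias was used on the command line
provided = []
for raw in sys.argv[1:]:
    name = raw.lstrip('-').replace('-', '_')
    if name in alias_to_dest:
        provided.append(alias_to_dest[name])

print(provided)  # '--n_ctx 4096' and '--ctx-size 4096' both yield ['ctx_size']

As the diff suggests, the earlier hasattr-based check missed aliases because only the canonical dest (ctx_size) exists as an attribute on the parsed namespace; resolving through the parser's option strings catches every spelling.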