Merge pull request #6645 from oobabooga/dev

Merge dev branch
This commit is contained in:
oobabooga 2025-01-09 18:46:28 -03:00 committed by GitHub
commit e6eda6a3bb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
42 changed files with 835 additions and 527 deletions

View file

@ -10,7 +10,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## Features
- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually.
- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually.
- OpenAI-compatible API with Chat and Completions endpoints; see the [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples) and the minimal sketch after this list.
- Automatic prompt formatting using Jinja2 templates.
- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
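
For reference, a minimal sketch of calling the Chat Completions endpoint mentioned above (assumptions: the server was started with --api and listens on the default API port 5000; the payload shape follows the wiki examples linked above):

import requests

# Hedged sketch; adjust the host/port if you changed --api-port.
url = "http://127.0.0.1:5000/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 64,
}
response = requests.post(url, json=payload)
print(response.json()["choices"][0]["message"]["content"])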

View file

@ -9,6 +9,7 @@
.message-body {
margin-top: 3px;
font-size: 15px !important;
}
.circle-you {

View file

@ -226,6 +226,7 @@ button {
max-width: 500px;
background-color: var(--input-background-fill);
border: var(--input-border-width) solid var(--input-border-color) !important;
padding: 10px;
}
.file-saver > :first-child > :last-child {
@ -499,8 +500,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
margin-bottom: 0.5em !important;
}
.message-body ul.long-list li,
.message-body ol.long-list li {
.message-body ul.long-list > li,
.message-body ol.long-list > li {
margin-top: 1.25em !important;
margin-bottom: 1.25em !important;
}
@ -538,8 +539,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.message-body pre > code {
white-space: pre-wrap !important;
word-wrap: break-word !important;
white-space: pre !important;
overflow-x: auto !important;
max-width: calc(100dvw - 39px);
border: 1px solid #666;
border-radius: 5px;
font-size: 82%;
@ -838,7 +840,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
Past chats menus
---------------------------------------------- */
#rename-row label {
margin-top: var(--layout-gap);
margin-top: 0;
}
#rename-row > :nth-child(2) {
justify-content: center;
}
/* ----------------------------------------------
@ -875,6 +881,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
flex-shrink: 1;
}
#search_chat > :nth-child(2) > :first-child {
display: none;
}
/* ----------------------------------------------
Keep dropdown menus above errored components
---------------------------------------------- */
@ -910,7 +920,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#past-chats {
max-height: calc(100dvh - 90px);
max-height: calc(100dvh - 135px);
overflow-y: scroll !important;
border-radius: 0;
scrollbar-width: auto;
@ -980,6 +990,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
#rename-row {
width: 100%;
justify-content: center;
gap: 9px;
}

View file

@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=B USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
RUN GPU_CHOICE=C LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui

View file

@ -17,7 +17,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=N USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
RUN GPU_CHOICE=N LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
# set umask to ensure group read / write at runtime

View file

@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=D USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
RUN GPU_CHOICE=E LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
# set umask to ensure group read / write at runtime

View file

@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui

View file

@ -72,7 +72,7 @@ class ModelDownloader:
return model, branch
def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None, exclude_pattern=None):
session = self.session
page = f"/api/models/{model}/tree/{branch}"
cursor = b""
@ -100,13 +100,17 @@ class ModelDownloader:
if specific_file not in [None, ''] and fname != specific_file:
continue
# Exclude files matching the exclude pattern
if exclude_pattern is not None and re.match(exclude_pattern, fname):
continue
if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')):
is_lora = True
is_pytorch = re.match(r"(pytorch|adapter|gptq)_model.*\.bin", fname)
is_safetensors = re.match(r".*\.safetensors", fname)
is_pt = re.match(r".*\.pt", fname)
is_gguf = re.match(r'.*\.gguf', fname)
is_gguf = re.match(r".*\.gguf", fname)
is_tiktoken = re.match(r".*\.tiktoken", fname)
is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname) or is_tiktoken
is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer
@ -140,7 +144,6 @@ class ModelDownloader:
# If both pytorch and safetensors are available, download safetensors only
# Also if GGUF and safetensors are available, download only safetensors
# (why do people do this?)
if (has_pytorch or has_pt or has_gguf) and has_safetensors:
has_gguf = False
for i in range(len(classifications) - 1, -1, -1):
@ -148,8 +151,6 @@ class ModelDownloader:
links.pop(i)
# For GGUF, try to download only the Q4_K_M if no specific file is specified.
# If not present, exclude all GGUFs, as that's likely a repository with both
# GGUF and fp16 files.
if has_gguf and specific_file is None:
has_q4km = False
for i in range(len(classifications) - 1, -1, -1):
@ -312,6 +313,7 @@ if __name__ == '__main__':
parser.add_argument('--threads', type=int, default=4, help='Number of files to download simultaneously.')
parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).')
parser.add_argument('--exclude-pattern', type=str, default=None, help='Regex pattern to exclude files from download.')
parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.')
parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).')
parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
@ -322,6 +324,7 @@ if __name__ == '__main__':
branch = args.branch
model = args.MODEL
specific_file = args.specific_file
exclude_pattern = args.exclude_pattern
if model is None:
print("Error: Please specify the model you'd like to download (e.g. 'python download-model.py facebook/opt-1.3b').")
@ -336,7 +339,9 @@ if __name__ == '__main__':
sys.exit()
# Get the download links from Hugging Face
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only, specific_file=specific_file)
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(
model, branch, text_only=args.text_only, specific_file=specific_file, exclude_pattern=exclude_pattern
)
# Get the output folder
if args.output:
@ -349,4 +354,7 @@ if __name__ == '__main__':
downloader.check_model_files(model, branch, links, sha256, output_folder)
else:
# Download files
downloader.download_model_files(model, branch, links, sha256, output_folder, specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp)
downloader.download_model_files(
model, branch, links, sha256, output_folder,
specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp
)
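
A hedged usage sketch of the new exclude_pattern parameter (repository id hypothetical; any regex accepted by re.match works, and the same value is exposed on the CLI as --exclude-pattern):

downloader = ModelDownloader()
# Skip all .bin files in the repository (hypothetical model id):
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(
    'some-org/some-model', 'main', exclude_pattern=r'.*\.bin'
)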

View file

@ -789,7 +789,11 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
logger.info("Getting model ready...")
# Gradient checkpointing could be disabled here; by default use_gradient_checkpointing=True
prepare_model_for_kbit_training(shared.model)
if 'quantization_config' in shared.model.config.to_dict():
print(f"Method: {RED}QLORA{RESET}")
prepare_model_for_kbit_training(shared.model)
else:
print(f"Method: {RED}LoRA{RESET}")
# base model is now frozen and should not be reused for any other LoRA training than this one
shared.model_dirty_from_training = True
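
A hedged sketch of the check that drives this branch (assumption: a bitsandbytes load such as --load-in-4bit attaches a quantization_config to the model config):

is_quantized = 'quantization_config' in shared.model.config.to_dict()
# True  -> prepare_model_for_kbit_training() is applied (QLoRA path)
# False -> plain LoRA on an unquantized base model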

View file

@ -353,23 +353,38 @@ async def handle_unload_loras():
def run_server():
server_addr = '0.0.0.0' if shared.args.listen else '127.0.0.1'
# Parse configuration
port = int(os.environ.get('OPENEDAI_PORT', shared.args.api_port))
ssl_certfile = os.environ.get('OPENEDAI_CERT_PATH', shared.args.ssl_certfile)
ssl_keyfile = os.environ.get('OPENEDAI_KEY_PATH', shared.args.ssl_keyfile)
# Determine which addresses to bind to
server_addrs = []
if os.environ.get('OPENEDAI_ENABLE_IPV6', shared.args.api_enable_ipv6):
server_addrs.append('[::]' if shared.args.listen else '[::1]')
if not os.environ.get('OPENEDAI_DISABLE_IPV4', shared.args.api_disable_ipv4):
server_addrs.append('0.0.0.0' if shared.args.listen else '127.0.0.1')
if not server_addrs:
raise Exception('you MUST enable IPv6 or IPv4 for the API to work')
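
A hedged sketch of the bind addresses this logic yields (names mirror the code above; outcomes derived from the conditions):

# listen=False, IPv6 off, IPv4 on  -> ['127.0.0.1']
# listen=True,  IPv6 on,  IPv4 on  -> ['[::]', '0.0.0.0']
# IPv6 off and IPv4 disabled       -> the Exception above is raised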
# Log server information
if shared.args.public_api:
def on_start(public_url: str):
logger.info(f'OpenAI-compatible API URL:\n\n{public_url}\n')
_start_cloudflared(port, shared.args.public_api_id, max_attempts=3, on_start=on_start)
_start_cloudflared(
port,
shared.args.public_api_id,
max_attempts=3,
on_start=lambda url: logger.info(f'OpenAI-compatible API URL:\n\n{url}\n')
)
else:
if ssl_keyfile and ssl_certfile:
logger.info(f'OpenAI-compatible API URL:\n\nhttps://{server_addr}:{port}\n')
url_proto = 'https://' if (ssl_certfile and ssl_keyfile) else 'http://'
urls = [f'{url_proto}{addr}:{port}' for addr in server_addrs]
if len(urls) > 1:
logger.info('OpenAI-compatible API URLs:\n\n' + '\n'.join(urls) + '\n')
else:
logger.info(f'OpenAI-compatible API URL:\n\nhttp://{server_addr}:{port}\n')
logger.info('OpenAI-compatible API URL:\n\n' + '\n'.join(urls) + '\n')
# Log API keys
if shared.args.api_key:
if not shared.args.admin_key:
shared.args.admin_key = shared.args.api_key
@ -379,8 +394,9 @@ def run_server():
if shared.args.admin_key and shared.args.admin_key != shared.args.api_key:
logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n')
# Start server
logging.getLogger("uvicorn.error").propagate = False
uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
def setup():

View file

@ -42,6 +42,7 @@ class GenerationOptions(BaseModel):
truncation_length: int = 0
max_tokens_second: int = 0
prompt_lookup_num_tokens: int = 0
static_cache: bool = False
custom_token_bans: str = ""
sampler_priority: List[str] | str | None = Field(default=None, description="List of samplers where the first items will appear first in the stack. Example: [\"top_k\", \"temperature\", \"top_p\"].")
auto_max_new_tokens: bool = False
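
A hedged sketch of enabling the new field over the API (assumption: extra generation options ride alongside the standard completion parameters, like the other fields in this model):

payload = {
    "prompt": "Once upon a time",
    "max_tokens": 32,
    "static_cache": True,  # the field added above
}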

View file

@ -446,25 +446,33 @@ function toggleBigPicture() {
//------------------------------------------------
// Handle the chat input box growth
//------------------------------------------------
let currentChatInputHeight = 0;
// Cache DOM elements
const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
const chatInput = document.querySelector("#chat-input textarea");
// Variables to store current dimensions
let currentChatInputHeight = chatInput.clientHeight;
// Update chat layout based on chat and input dimensions
function updateCssProperties() {
const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
const chatInputHeight = document.querySelector("#chat-input textarea").clientHeight;
const chatInputHeight = chatInput.clientHeight;
// Check if the chat container is visible
if (chatContainer.clientHeight > 0) {
const newChatHeight = `${chatContainer.parentNode.clientHeight - chatInputHeight + 40 - 100 - 20}px`;
const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;
document.documentElement.style.setProperty("--chat-height", newChatHeight);
document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);
// Adjust scrollTop based on input height change
if (chatInputHeight !== currentChatInputHeight) {
if (!isScrolled && chatInputHeight < currentChatInputHeight) {
const deltaHeight = chatInputHeight - currentChatInputHeight;
if (!isScrolled && deltaHeight < 0) {
chatContainer.scrollTop = chatContainer.scrollHeight;
} else {
chatContainer.scrollTop += chatInputHeight - currentChatInputHeight;
chatContainer.scrollTop += deltaHeight;
}
currentChatInputHeight = chatInputHeight;

View file

@ -1,74 +0,0 @@
from pathlib import Path
from accelerate.utils import is_xpu_available
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import modules.shared as shared
from modules.logging_colors import logger
from modules.models import get_max_memory_dict
def load_quantized(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
pt_path = None
# Find the model checkpoint
if shared.args.checkpoint:
pt_path = Path(shared.args.checkpoint)
else:
for ext in ['.safetensors', '.pt', '.bin']:
found = list(path_to_model.glob(f"*{ext}"))
if len(found) > 0:
if len(found) > 1:
logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')
pt_path = found[-1]
break
if pt_path is None:
logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.")
return
use_safetensors = pt_path.suffix == '.safetensors'
if not (path_to_model / "quantize_config.json").exists():
quantize_config = BaseQuantizeConfig(
bits=bits if (bits := shared.args.wbits) > 0 else 4,
group_size=gs if (gs := shared.args.groupsize) > 0 else -1,
desc_act=shared.args.desc_act
)
else:
quantize_config = None
# Define the params for AutoGPTQForCausalLM.from_quantized
params = {
'model_basename': pt_path.stem,
'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu",
'use_triton': shared.args.triton,
'inject_fused_attention': False,
'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
'use_safetensors': use_safetensors,
'trust_remote_code': shared.args.trust_remote_code,
'max_memory': get_max_memory_dict(),
'quantize_config': quantize_config,
'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
'disable_exllama': shared.args.disable_exllama,
'disable_exllamav2': shared.args.disable_exllamav2,
}
logger.info(f"The AutoGPTQ params are: {params}")
model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)
# These lines fix the multimodal extension when used with AutoGPTQ
if hasattr(model, 'model'):
if not hasattr(model, 'dtype'):
if hasattr(model.model, 'dtype'):
model.dtype = model.model.dtype
if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'):
if not hasattr(model, 'embed_tokens'):
model.embed_tokens = model.model.model.embed_tokens
if not hasattr(model.model, 'embed_tokens'):
model.model.embed_tokens = model.model.model.embed_tokens
return model

View file

@ -1,17 +1,12 @@
from pathlib import Path
import torch
from transformers import is_torch_xpu_available
import modules.shared as shared
from modules.logging_colors import logger
from modules.models import reload_model
from modules.models import get_device
def add_lora_to_model(lora_names):
if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ':
add_lora_autogptq(lora_names)
elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
add_lora_exllamav2(lora_names)
else:
add_lora_transformers(lora_names)
@ -51,38 +46,6 @@ def add_lora_exllamav2(lora_names):
shared.model.loras = None
def add_lora_autogptq(lora_names):
'''
Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing
'''
try:
from auto_gptq import get_gptq_peft_model
from auto_gptq.utils.peft_utils import GPTQLoraConfig
except:
logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.")
return
if len(lora_names) == 0:
reload_model()
shared.lora_names = []
return
else:
if len(lora_names) > 1:
logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
peft_config = GPTQLoraConfig(
inference_mode=True,
)
lora_path = get_lora_path(lora_names[0])
logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path)
shared.lora_names = [lora_names[0]]
return
def add_lora_transformers(lora_names):
from peft import PeftModel
@ -132,14 +95,9 @@ def add_lora_transformers(lora_names):
if not shared.args.load_in_8bit and not shared.args.cpu:
shared.model.half()
if not hasattr(shared.model, "hf_device_map"):
if torch.backends.mps.is_available():
device = torch.device('mps')
device = get_device()
if device:
shared.model = shared.model.to(device)
elif is_torch_xpu_available():
device = torch.device("xpu:0")
shared.model = shared.model.to(device)
else:
shared.model = shared.model.cuda()
shared.lora_names = lora_names

View file

@ -1,11 +1,9 @@
import gc
import traceback
from queue import Queue
from threading import Thread
import torch
import transformers
from transformers import is_torch_npu_available, is_torch_xpu_available
import modules.shared as shared
@ -65,7 +63,6 @@ class Iteratorize:
traceback.print_exc()
pass
clear_torch_cache()
self.q.put(self.sentinel)
if self.c_callback:
self.c_callback(ret)
@ -84,22 +81,10 @@ class Iteratorize:
return obj
def __del__(self):
clear_torch_cache()
pass
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.stop_now = True
clear_torch_cache()
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
if is_torch_xpu_available():
torch.xpu.empty_cache()
elif is_torch_npu_available():
torch.npu.empty_cache()
else:
torch.cuda.empty_cache()

View file

@ -593,21 +593,26 @@ def find_all_histories_with_first_prompts(state):
result = []
for i, path in enumerate(histories):
filename = path.stem
if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename):
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
file_content = ""
with open(path, 'r', encoding='utf-8') as f:
file_content = f.read()
first_prompt = ""
if data and 'visible' in data and len(data['visible']) > 0:
if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
if len(data['visible']) > 1:
first_prompt = html.unescape(data['visible'][1][0])
elif i == 0:
first_prompt = "New chat"
else:
first_prompt = html.unescape(data['visible'][0][0])
elif i == 0:
first_prompt = "New chat"
if state['search_chat'] and state['search_chat'] not in file_content:
continue
data = json.loads(file_content)
if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename):
first_prompt = ""
if data and 'visible' in data and len(data['visible']) > 0:
if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>':
if len(data['visible']) > 1:
first_prompt = html.unescape(data['visible'][1][0])
elif i == 0:
first_prompt = "New chat"
else:
first_prompt = html.unescape(data['visible'][0][0])
elif i == 0:
first_prompt = "New chat"
else:
first_prompt = filename
@ -615,7 +620,7 @@ def find_all_histories_with_first_prompts(state):
# Truncate the first prompt if it's longer than 30 characters
if len(first_prompt) > 30:
first_prompt = first_prompt[:30-3] + '...'
first_prompt = first_prompt[:30 - 3] + '...'
result.append((first_prompt, filename))
@ -1092,6 +1097,21 @@ def handle_delete_chat_confirm_click(state):
]
def handle_branch_chat_click(state):
history = state['history']
new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
save_history(history, new_unique_id, state['character_menu'], state['mode'])
histories = find_all_histories_with_first_prompts(state)
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
convert_to_markdown.cache_clear()
past_chats_update = gr.update(choices=histories, value=new_unique_id)
return [history, html, past_chats_update]
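
Note that the generated id matches the history-filename pattern checked in find_all_histories_with_first_prompts above; a quick sketch:

from datetime import datetime
import re

new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')  # e.g. '20250109-18-46-28'
assert re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', new_unique_id)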
def handle_rename_chat_click():
return [
gr.update(value="My New Chat"),
@ -1109,6 +1129,11 @@ def handle_rename_chat_confirm(rename_to, state):
]
def handle_search_chat_change(state):
histories = find_all_histories_with_first_prompts(state)
return gr.update(choices=histories)
def handle_upload_chat_history(load_chat_history, state):
history = start_new_chat(state)
history = load_history_json(load_chat_history, history)

View file

@ -1,8 +1,8 @@
import json
import traceback
from pathlib import Path
import torch
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
@ -15,6 +15,7 @@ from exllamav2 import (
ExLlamaV2Tokenizer
)
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
@ -122,6 +123,10 @@ class Exllamav2Model:
settings.token_presence_penalty = state['presence_penalty']
settings.temperature = state['temperature']
settings.smoothing_factor = state['smoothing_factor']
settings.min_temp = state['dynatemp_low'] if state['dynamic_temperature'] else 0
settings.max_temp = state['dynatemp_high'] if state['dynamic_temperature'] else 0
settings.temp_exponent = state['dynatemp_exponent']
settings.top_k = state['top_k']
settings.top_p = state['top_p']
settings.top_a = state['top_a']
@ -143,6 +148,29 @@ class Exllamav2Model:
if len(to_ban) > 0:
settings.disallow_tokens(self.tokenizer, to_ban)
settings.dry_allowed_length = state['dry_allowed_length']
settings.dry_base = state['dry_base']
settings.dry_multiplier = state['dry_multiplier']
# DRY sequence breakers processing
if state['dry_multiplier'] > 0 and state['dry_sequence_breakers']:
dry_sequence_breakers = state['dry_sequence_breakers']
# Support both JSON array notation and comma-separated strings.
if not dry_sequence_breakers.startswith("["):
dry_sequence_breakers = "[" + dry_sequence_breakers + "]"
sequence_breaker_strings = json.loads(dry_sequence_breakers)
# Prefix with 'a' to get the correct encoding of the token at the end of a text.
sequence_breakers = {
self.encode(f"a{s}")[0, -1].item() for s in sequence_breaker_strings
}
settings.dry_sequence_breakers = sequence_breakers
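
A hedged illustration of the 'a'-prefix trick (token ids hypothetical): encoding a breaker string on its own can return its start-of-text variant, while prefixing with 'a' and taking the last position yields the id the string has mid-text:

# Hypothetical ids for illustration only:
# self.encode('\n')   -> [[13]]       (possibly a start-of-text variant)
# self.encode('a\n')  -> [[64, 13]]   -> [0, -1] selects 13, the mid-text id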
settings.xtc_probability = state['xtc_probability']
settings.xtc_threshold = state['xtc_threshold']
ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
ids = ids[:, -get_max_prompt_length(state):]

View file

@ -9,11 +9,35 @@ import markdown
from PIL import Image, ImageOps
from modules import shared
from modules.sane_markdown_lists import SaneListExtension
from modules.utils import get_available_chat_styles
# This is to store the paths to the thumbnails of the profile pictures
image_cache = {}
def minify_css(css: str) -> str:
# Step 1: Remove comments
css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)
# Step 2: Remove leading and trailing whitespace
css = re.sub(r'^[ \t]*|[ \t]*$', '', css, flags=re.MULTILINE)
# Step 3: Remove spaces after specific characters ({ : ; ,})
css = re.sub(r'([:{;,])\s+', r'\1', css)
# Step 4: Remove spaces before `{`
css = re.sub(r'\s+{', '{', css)
# Step 5: Remove empty lines
css = re.sub(r'^\s*$', '', css, flags=re.MULTILINE)
# Step 6: Collapse all lines into one
css = re.sub(r'\n', '', css)
return css
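
A small sketch of the transformation, with the output derived by following the six steps above:

css = "/* comment */\nbody {\n    color: red;\n}"
minify_css(css)  # -> 'body{color:red;}'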
with open(Path(__file__).resolve().parent / '../css/html_readable_style.css', 'r') as f:
readable_css = f.read()
with open(Path(__file__).resolve().parent / '../css/html_instruct_style.css', 'r') as f:
@ -34,6 +58,12 @@ for k in chat_styles:
style = match.group(1)
chat_styles[k] = chat_styles.get(style, '') + '\n\n' + '\n'.join(lines[1:])
# Reduce the size of the CSS sources above
readable_css = minify_css(readable_css)
instruct_css = minify_css(instruct_css)
for k in chat_styles:
chat_styles[k] = minify_css(chat_styles[k])
def fix_newlines(string):
string = string.replace('\n', '\n\n')
@ -174,7 +204,7 @@ def convert_to_markdown(string):
result += '\n'
# Also don't add an extra \n for lists
elif stripped_line.startswith('-') or stripped_line.startswith('*') or stripped_line.startswith('+') or stripped_line.startswith('>') or re.match(r'\d+\.', stripped_line):
result += '\n'
result += ' \n'
else:
result += ' \n'
@ -195,7 +225,7 @@ def convert_to_markdown(string):
result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result)
# Convert to HTML using markdown
html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])
# Remove the delete string from the HTML output
pos = html_output.rfind(delete_str)
@ -203,7 +233,7 @@ def convert_to_markdown(string):
html_output = html_output[:pos] + html_output[pos + len(delete_str):]
else:
# Convert to HTML using markdown
html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])
# Unescape code blocks
pattern = re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL)
@ -267,29 +297,24 @@ def generate_instruct_html(history):
for i, _row in enumerate(history):
row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row]
if row[0]: # don't display empty user messages
output += f"""
<div class="user-message">
<div class="text">
<div class="message-body">
{row[0]}
</div>
</div>
</div>
"""
if row[0]: # Don't display empty user messages
output += (
f'<div class="user-message">'
f'<div class="text">'
f'<div class="message-body">{row[0]}</div>'
f'</div>'
f'</div>'
)
output += f"""
<div class="assistant-message">
<div class="text">
<div class="message-body">
{row[1]}
</div>
</div>
</div>
"""
output += (
f'<div class="assistant-message">'
f'<div class="text">'
f'<div class="message-body">{row[1]}</div>'
f'</div>'
f'</div>'
)
output += "</div></div>"
return output
@ -297,44 +322,39 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=
output = f'<style>{chat_styles[style]}</style><div class="chat" id="chat"><div class="messages">'
# We use ?character and ?time.time() to force the browser to reset caches
img_bot = f'<img src="file/cache/pfp_character_thumb.png?{character}" class="pfp_character">' if Path("cache/pfp_character_thumb.png").exists() else ''
img_me = f'<img src="file/cache/pfp_me.png?{time.time() if reset_cache else ""}">' if Path("cache/pfp_me.png").exists() else ''
img_bot = (
f'<img src="file/cache/pfp_character_thumb.png?{character}" class="pfp_character">'
if Path("cache/pfp_character_thumb.png").exists() else ''
)
img_me = (
f'<img src="file/cache/pfp_me.png?{time.time() if reset_cache else ""}">'
if Path("cache/pfp_me.png").exists() else ''
)
for i, _row in enumerate(history):
row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row]
if row[0]: # don't display empty user messages
output += f"""
<div class="message">
<div class="circle-you">
{img_me}
</div>
<div class="text">
<div class="username">
{name1}
</div>
<div class="message-body">
{row[0]}
</div>
</div>
</div>
"""
if row[0]: # Don't display empty user messages
output += (
f'<div class="message">'
f'<div class="circle-you">{img_me}</div>'
f'<div class="text">'
f'<div class="username">{name1}</div>'
f'<div class="message-body">{row[0]}</div>'
f'</div>'
f'</div>'
)
output += f"""
<div class="message">
<div class="circle-bot">
{img_bot}
</div>
<div class="text">
<div class="username">
{name2}
</div>
<div class="message-body">
{row[1]}
</div>
</div>
</div>
"""
output += (
f'<div class="message">'
f'<div class="circle-bot">{img_bot}</div>'
f'<div class="text">'
f'<div class="username">{name2}</div>'
f'<div class="message-body">{row[1]}</div>'
f'</div>'
f'</div>'
)
output += "</div></div>"
return output
@ -346,26 +366,22 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
for i, _row in enumerate(history):
row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row]
if row[0]: # don't display empty user messages
output += f"""
<div class="message">
<div class="text-you">
<div class="message-body">
{row[0]}
</div>
</div>
</div>
"""
if row[0]: # Don't display empty user messages
output += (
f'<div class="message">'
f'<div class="text-you">'
f'<div class="message-body">{row[0]}</div>'
f'</div>'
f'</div>'
)
output += f"""
<div class="message">
<div class="text-bot">
<div class="message-body">
{row[1]}
</div>
</div>
</div>
"""
output += (
f'<div class="message">'
f'<div class="text-bot">'
f'<div class="message-body">{row[1]}</div>'
f'</div>'
f'</div>'
)
output += "</div></div>"
return output

View file

@ -122,7 +122,14 @@ class LlamaCppModel:
return self.model.tokenize(string)
def decode(self, ids, **kwargs):
return self.model.detokenize(ids).decode('utf-8')
detokenized = self.model.detokenize(ids)
try:
# Attempt strict UTF-8 decoding first
return detokenized.decode('utf-8', 'strict')
except UnicodeDecodeError as e:
# Log the error and fall back to UTF-8 with replacement
logger.warning(f"Invalid UTF-8 in detokenized output. Using replacement characters.\n{e}")
return detokenized.decode('utf-8', 'replace')
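
The fallback mirrors plain bytes.decode semantics; a quick sketch with a truncated multi-byte sequence:

b'\xf0\x9f\x98'.decode('utf-8', 'strict')   # raises UnicodeDecodeError
b'\xf0\x9f\x98'.decode('utf-8', 'replace')  # -> '\ufffd' (replacement character)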
def get_logits(self, tokens):
self.model.reset()

View file

@ -9,12 +9,13 @@ loaders_and_params = OrderedDict({
'Transformers': [
'cpu_memory',
'gpu_memory',
'load_in_4bit',
'load_in_8bit',
'torch_compile',
'bf16',
'cpu',
'disk',
'auto_devices',
'load_in_4bit',
'use_double_quant',
'quant_type',
'compute_dtype',
@ -24,8 +25,6 @@ loaders_and_params = OrderedDict({
'use_eager_attention',
'alpha_value',
'compress_pos_emb',
'disable_exllama',
'disable_exllamav2',
],
'llama.cpp': [
'n_ctx',
@ -106,24 +105,6 @@ loaders_and_params = OrderedDict({
'compress_pos_emb',
'exllamav2_info',
],
'AutoGPTQ': [
'triton',
'no_inject_fused_mlp',
'no_use_cuda_fp16',
'wbits',
'groupsize',
'desc_act',
'disable_exllama',
'disable_exllamav2',
'gpu_memory',
'cpu_memory',
'cpu',
'disk',
'auto_devices',
'trust_remote_code',
'no_use_fast',
'autogptq_info',
],
'HQQ': [
'hqq_backend',
'trust_remote_code',
@ -183,17 +164,21 @@ def transformers_samplers():
'add_bos_token',
'skip_special_tokens',
'auto_max_new_tokens',
'prompt_lookup_num_tokens'
'prompt_lookup_num_tokens',
'static_cache',
}
loaders_samplers = {
'Transformers': transformers_samplers(),
'AutoGPTQ': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav2': {
'temperature',
'temperature_last',
'smoothing_factor',
'dynatemp_low',
'dynatemp_high',
'dynatemp_exponent',
'top_p',
'min_p',
'top_k',
@ -204,10 +189,16 @@ loaders_samplers = {
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'seed',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'dry_multiplier',
'dry_base',
'dry_allowed_length',
'dry_sequence_breakers',
'xtc_threshold',
'xtc_probability',
'seed',
'ban_eos_token',
'add_bos_token',
'custom_token_bans',

View file

@ -2,11 +2,10 @@ import time
import traceback
import torch
from transformers import is_torch_npu_available, is_torch_xpu_available
from modules import models, sampler_hijack, shared
from modules.logging_colors import logger
from modules.models import load_model
from modules.models import get_device, load_model
from modules.text_generation import generate_reply
global_scores = None
@ -57,23 +56,21 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
scores = sampler_hijack.global_scores[-1]
else:
if is_non_hf_exllamav2:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt).to("xpu:0")
elif is_torch_npu_available():
tokens = shared.tokenizer.encode(prompt).to("npu:0")
else:
tokens = shared.tokenizer.encode(prompt).cuda()
device = get_device()
tokens = shared.tokenizer.encode(prompt)
if device:
tokens = tokens.to(device)
scores = shared.model.get_logits(tokens)[-1][-1]
elif is_non_hf_llamacpp:
tokens = shared.tokenizer.encode(prompt)
scores = shared.model.get_logits(tokens)[-1][-1]
else:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0")
elif is_torch_npu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("npu:0")
else:
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()
device = get_device()
tokens = shared.tokenizer.encode(prompt, return_tensors='pt')
if device:
tokens = tokens.to(device)
output = shared.model(input_ids=tokens)
scores = output['logits'][-1][-1]

View file

@ -3,7 +3,6 @@ import os
import pprint
import re
import time
import traceback
from pathlib import Path
import torch
@ -21,11 +20,11 @@ from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
BitsAndBytesConfig,
GPTQConfig
is_torch_npu_available,
is_torch_xpu_available
)
import modules.shared as shared
from modules import sampler_hijack
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
@ -56,8 +55,6 @@ if shared.args.deepspeed:
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
sampler_hijack.hijack_samplers()
last_generation_time = time.time()
@ -74,7 +71,6 @@ def load_model(model_name, loader=None):
'llamacpp_HF': llamacpp_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'AutoGPTQ': AutoGPTQ_loader,
'HQQ': HQQ_loader,
'TensorRT-LLM': TensorRT_LLM_loader,
}
@ -90,6 +86,7 @@ def load_model(model_name, loader=None):
raise ValueError
shared.args.loader = loader
clear_torch_cache()
output = load_func_map[loader](model_name)
if type(output) is tuple:
model, tokenizer = output
@ -163,30 +160,48 @@ def huggingface_loader(model_name):
else:
LoaderClass = AutoModelForCausalLM
# Determine if we should use default loading
should_use_default_loading = not any([
shared.args.cpu,
shared.args.load_in_8bit,
shared.args.load_in_4bit,
shared.args.auto_devices,
shared.args.disk,
shared.args.deepspeed,
shared.args.gpu_memory is not None,
shared.args.cpu_memory is not None,
shared.args.compress_pos_emb > 1,
shared.args.alpha_value > 1,
])
# Load the model without any special settings
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]):
if should_use_default_loading:
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
if torch.backends.mps.is_available():
device = torch.device('mps')
device = get_device()
if device:
model = model.to(device)
elif is_xpu_available():
device = torch.device("xpu")
model = model.to(device)
elif is_npu_available():
device = torch.device("npu")
model = model.to(device)
else:
model = model.cuda()
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = LoaderClass.from_pretrained(path_to_model, torch_dtype=params['torch_dtype'], trust_remote_code=params.get('trust_remote_code'))
model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0]
model = LoaderClass.from_pretrained(
path_to_model,
torch_dtype=params['torch_dtype'],
trust_remote_code=params.get('trust_remote_code')
)
model = deepspeed.initialize(
model=model,
config_params=ds_config,
model_parameters=None,
optimizer=None,
lr_scheduler=None
)[0]
model.module.eval() # Inference
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
@ -208,16 +223,15 @@ def huggingface_loader(model_name):
# and https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config_params = {
'load_in_4bit': True,
'bnb_4bit_compute_dtype': eval("torch.{}".format(shared.args.compute_dtype)) if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
'bnb_4bit_quant_type': shared.args.quant_type,
'bnb_4bit_use_double_quant': shared.args.use_double_quant,
'llm_int8_enable_fp32_cpu_offload': True
}
params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
elif shared.args.load_in_8bit:
if any((shared.args.auto_devices, shared.args.gpu_memory)):
if shared.args.auto_devices or shared.args.gpu_memory:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
else:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
@ -237,21 +251,6 @@ def huggingface_loader(model_name):
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
if shared.args.disable_exllama or shared.args.disable_exllamav2:
try:
gptq_config = GPTQConfig(
bits=config.quantization_config.get('bits', 4),
disable_exllama=shared.args.disable_exllama,
disable_exllamav2=shared.args.disable_exllamav2,
)
params['quantization_config'] = gptq_config
logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.')
except:
exc = traceback.format_exc()
logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?')
print(exc)
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
elif shared.args.alpha_value > 1:
@ -262,6 +261,9 @@ def huggingface_loader(model_name):
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if shared.args.torch_compile:
model = torch.compile(model)
return model
@ -315,15 +317,6 @@ def ExLlamav2_HF_loader(model_name):
return Exllamav2HF.from_pretrained(model_name)
def AutoGPTQ_loader(model_name):
try:
import modules.AutoGPTQ_loader
except ModuleNotFoundError:
raise ModuleNotFoundError("Failed to import 'autogptq'. Please install it manually following the instructions in the AutoGPTQ GitHub repository.")
return modules.AutoGPTQ_loader.load_quantized(model_name)
def HQQ_loader(model_name):
try:
from hqq.core.quantize import HQQBackend, HQQLinear
@ -379,13 +372,34 @@ def get_max_memory_dict():
return max_memory if len(max_memory) > 0 else None
def get_device():
if torch.cuda.is_available():
return torch.device('cuda')
elif shared.args.deepspeed:
import deepspeed
return deepspeed.get_accelerator().current_device_name()
elif torch.backends.mps.is_available():
return torch.device('mps')
elif is_torch_xpu_available():
return torch.device('xpu:0')
elif is_torch_npu_available():
return torch.device('npu:0')
else:
return None
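
Callers treat this as a move-if-available helper; the pattern used throughout this commit (see the logits.py and LoRA.py hunks above) is:

device = get_device()
tokens = shared.tokenizer.encode(prompt, return_tensors='pt')
if device:
    tokens = tokens.to(device)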
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
if is_xpu_available():
torch.xpu.empty_cache()
else:
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
elif is_npu_available():
torch.npu.empty_cache()
elif torch.backends.mps.is_available():
if hasattr(torch.backends.mps, 'empty_cache'):
torch.backends.mps.empty_cache()
def unload_model(keep_model_name=False):

View file

@ -11,9 +11,6 @@ def get_fallback_settings():
return {
'bf16': False,
'use_eager_attention': False,
'wbits': 'None',
'groupsize': 'None',
'desc_act': False,
'max_seq_len': 2048,
'n_ctx': 2048,
'rope_freq_base': 0,
@ -111,26 +108,6 @@ def get_model_metadata(model):
if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']:
model_settings['use_eager_attention'] = True
# Read GPTQ metadata for old GPTQ loaders
if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2':
if 'bits' in metadata['quantization_config']:
model_settings['wbits'] = metadata['quantization_config']['bits']
if 'group_size' in metadata['quantization_config']:
model_settings['groupsize'] = metadata['quantization_config']['group_size']
if 'desc_act' in metadata['quantization_config']:
model_settings['desc_act'] = metadata['quantization_config']['desc_act']
# Read AutoGPTQ metadata
path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json')
if path.exists():
metadata = json.loads(open(path, 'r', encoding='utf-8').read())
if 'bits' in metadata:
model_settings['wbits'] = metadata['bits']
if 'group_size' in metadata:
model_settings['groupsize'] = metadata['group_size']
if 'desc_act' in metadata:
model_settings['desc_act'] = metadata['desc_act']
# Try to find the Jinja instruct template
path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
if path.exists():
@ -178,7 +155,7 @@ def infer_loader(model_name, model_settings):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
if not path_to_model.exists():
loader = None
elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
elif (path_to_model / 'quantize_config.json').exists(): # Old GPTQ metadata file
loader = 'ExLlamav2_HF'
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
loader = 'llamacpp_HF'
@ -215,16 +192,11 @@ def update_model_parameters(state, initial=False):
if initial and element in shared.provided_arguments:
continue
# Setting null defaults
if element in ['wbits', 'groupsize'] and value == 'None':
value = vars(shared.args_defaults)[element]
elif element in ['cpu_memory'] and value == 0:
if element in ['cpu_memory'] and value == 0:
value = vars(shared.args_defaults)[element]
# Making some simple conversions
if element in ['wbits', 'groupsize']:
value = int(value)
elif element == 'cpu_memory' and value is not None:
if element == 'cpu_memory' and value is not None:
value = f"{value}MiB"
setattr(shared.args, element, value)
@ -251,15 +223,12 @@ def apply_model_settings_to_state(model, state):
loader = model_settings.pop('loader')
# If the user is using an alternative loader for the same model type, let them keep using it
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']):
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
state['loader'] = loader
for k in model_settings:
if k in state:
if k in ['wbits', 'groupsize']:
state[k] = str(model_settings[k])
else:
state[k] = model_settings[k]
state[k] = model_settings[k]
return state

View file

@ -5,7 +5,7 @@ import random
import torch
import transformers
from transformers import LogitsWarper, is_torch_xpu_available
from transformers import LogitsWarper
from transformers.generation.logits_process import (
LogitNormalization,
LogitsProcessor,
@ -14,6 +14,7 @@ from transformers.generation.logits_process import (
from modules import shared
from modules.logging_colors import logger
from modules.models import get_device
global_scores = None
@ -339,12 +340,12 @@ class MirostatLogitsWarper(LogitsWarper):
break
# Normalize the probabilities of the remaining words
if is_torch_xpu_available():
prob_topk = torch.softmax(sorted_logits, dim=0).to("xpu")
prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to("xpu")
else:
prob_topk = torch.softmax(sorted_logits, dim=0).to('cuda')
prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda')
prob_topk = torch.softmax(sorted_logits, dim=0)
prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True)
device = get_device()
if device:
prob_topk = prob_topk.to(device)
prev_i = prev_i.to(device)
observed_surprise = -math.log2(prob_topk[prev_i])
self.e = observed_surprise - self.mirostat_tau
@ -494,7 +495,9 @@ def get_logits_processor_patch(self, **kwargs):
sequence_breaker_strings = json.loads(dry_sequence_breakers)
# Prefix with 'a' to get the correct encoding of the token at the end of a text.
sequence_breakers = {shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings}
sequence_breakers = {
shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings
}
warpers.append(
DRYLogitsProcessor(

View file

@ -0,0 +1,336 @@
# Code based on the Sane List Extension for Python-Markdown
# =======================================
# Modify the behavior of Lists in Python-Markdown to act in a sane manner.
# See https://Python-Markdown.github.io/extensions/sane_lists
# for documentation.
# Original code Copyright 2011 [Waylan Limberg](http://achinghead.com)
# All changes Copyright 2011-2014 The Python Markdown Project
# License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
Modify the behavior of Lists in Python-Markdown to act in a sane manner.
"""
from __future__ import annotations
import re
import xml.etree.ElementTree as etree
from typing import TYPE_CHECKING
from markdown import Extension
from markdown.blockparser import BlockParser
from markdown.blockprocessors import (
ListIndentProcessor,
OListProcessor,
ParagraphProcessor
)
if TYPE_CHECKING: # pragma: no cover
from markdown import blockparser
# The min. number of added leading spaces needed to start a nested list
MIN_NESTED_LIST_INDENT = 2
assert MIN_NESTED_LIST_INDENT > 1, "'MIN_NESTED_LIST_INDENT' must be > 1"
class SaneListIndentProcessor(ListIndentProcessor):
""" Process children of list items.
Example
* a list item
process this part
or this part
"""
def __init__(self, *args):
super().__init__(*args)
self.INDENT_RE = re.compile(r'^(([ ])+)')
def test(self, parent: etree.Element, block: str) -> bool:
return block.startswith(' ' * MIN_NESTED_LIST_INDENT) and \
not self.parser.state.isstate('detabbed') and \
(parent.tag in self.ITEM_TYPES or
(len(parent) and parent[-1] is not None and
(parent[-1].tag in self.LIST_TYPES)))
def get_level(self, parent: etree.Element, block: str) -> tuple[int, etree.Element]:
""" Get level of indentation based on list level. """
# Get indent level
m = self.INDENT_RE.match(block)
if m:
indent_level = len(m.group(1)) / MIN_NESTED_LIST_INDENT
else:
indent_level = 0
if self.parser.state.isstate('list'):
# We're in a tight-list - so we already are at correct parent.
level = 1
else:
# We're in a loose-list - so we need to find parent.
level = 0
# Step through children of tree to find matching indent level.
while indent_level > level:
child = self.lastChild(parent)
if (child is not None and
(child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)):
if child.tag in self.LIST_TYPES:
level += 1
parent = child
else:
# No more child levels. If we're short of `indent_level`,
# we have a code block. So we stop here.
break
return level, parent
def detab(self, text: str, length: int | None = None) -> tuple[str, str]:
""" Remove a tab from the front of each line of the given text. """
if length is None:
length = MIN_NESTED_LIST_INDENT
newtext = []
lines = text.split('\n')
for line in lines:
if line.startswith(' ' * length):
newtext.append(line[length:])
elif not line.strip():
newtext.append('')
else:
break
return '\n'.join(newtext), '\n'.join(lines[len(newtext):])
def looseDetab(self, text: str, level: int = 1) -> str:
""" Remove indentation from front of lines but allowing dedented lines. """
lines = text.split('\n')
for i in range(len(lines)):
if lines[i].startswith(' ' * MIN_NESTED_LIST_INDENT * level):
lines[i] = lines[i][MIN_NESTED_LIST_INDENT * level:]
return '\n'.join(lines)
class SaneOListProcessor(OListProcessor):
""" Override `SIBLING_TAGS` to not include `ul` and set `LAZY_OL` to `False`. """
SIBLING_TAGS = ['ol']
""" Exclude `ul` from list of siblings. """
LAZY_OL = False
""" Disable lazy list behavior. """
def __init__(self, parser: blockparser.BlockParser):
super().__init__(parser)
# This restriction stems from the 'CodeBlockProcessor' class,
# which automatically matches blocks with an indent = self.tab_length
max_list_start_indent = self.tab_length - 1
# Detect an item (e.g., `1. item`)
self.RE = re.compile(r'^[ ]{0,%d}[\*_]{0,2}\d+\.[ ]+(.*)' % max_list_start_indent)
# Detect items on secondary lines. They can be of either list type.
self.CHILD_RE = re.compile(r'^[ ]{0,%d}([\*_]{0,2})((\d+\.))[ ]+(.*)' % (MIN_NESTED_LIST_INDENT - 1))
# Detect indented (nested) items of either type
self.INDENT_RE = re.compile(r'^[ ]{%d,%d}[\*_]{0,2}((\d+\.)|[*+-])[ ]+.*' %
(MIN_NESTED_LIST_INDENT, self.tab_length * 2 - 1))
def run(self, parent: etree.Element, blocks: list[str]) -> None:
# Check for multiple items in one block.
items = self.get_items(blocks.pop(0))
sibling = self.lastChild(parent)
if sibling is not None and sibling.tag in self.SIBLING_TAGS:
# Previous block was a list item, so set that as parent
lst = sibling
# make sure previous item is in a `p` - if the item has text,
# then it isn't in a `p`
if lst[-1].text:
# since it's possible there are other children for this
# sibling, we can't just `SubElement` the `p`, we need to
# insert it as the first item.
p = etree.Element('p')
p.text = lst[-1].text
lst[-1].text = ''
lst[-1].insert(0, p)
# if the last item has a tail, then the tail needs to be put in a `p`
# likely only when a header is not followed by a blank line
lch = self.lastChild(lst[-1])
if lch is not None and lch.tail:
p = etree.SubElement(lst[-1], 'p')
p.text = lch.tail.lstrip()
lch.tail = ''
# parse first block differently as it gets wrapped in a `p`.
li = etree.SubElement(lst, 'li')
self.parser.state.set('looselist')
firstitem = items.pop(0)
self.parser.parseBlocks(li, [firstitem])
self.parser.state.reset()
elif parent.tag in ['ol', 'ul']:
# this catches the edge case of a multi-item indented list whose
# first item is in a blank parent-list item:
# * * subitem1
# * subitem2
# see also `ListIndentProcessor`
lst = parent
else:
# This is a new list so create parent with appropriate tag.
lst = etree.SubElement(parent, self.TAG)
# Check if a custom start integer is set
if not self.LAZY_OL and self.STARTSWITH != '1':
lst.attrib['start'] = self.STARTSWITH
self.parser.state.set('list')
# Loop through items in block, recursively parsing each with the
# appropriate parent.
for item in items:
if item.startswith(" " * MIN_NESTED_LIST_INDENT):
# Item is indented. Parse with last item as parent
self.parser.parseBlocks(lst[-1], [item])
else:
# New item. Create `li` and parse with it as parent
li = etree.SubElement(lst, 'li')
self.parser.parseBlocks(li, [item])
self.parser.state.reset()
def looseDetab(self, text: str, indent_length: int, level: int = 1) -> str:
""" Remove indentation from front of lines but allowing dedented lines. """
lines = text.split('\n')
for i in range(len(lines)):
if lines[i].startswith(' ' * indent_length * level):
lines[i] = lines[i][indent_length * level:]
return '\n'.join(lines)
def get_items(self, block: str) -> list[str]:
""" Break a block into list items. """
# If first level of list is indented, remove that indentation
if (indent_len := len(block) - len(block.lstrip())) > 0:
block = self.looseDetab(block, indent_len)
items = []
for line in block.split('\n'):
m = self.CHILD_RE.match(line)
if m:
# This is a new list item
# Check first item for the start index
if not items:
# Detect the integer value of first list item
INTEGER_RE = re.compile(r'(\d+)')
self.STARTSWITH = INTEGER_RE.match(m.group(2)).group()
# Append to the list
items.append(m.group(1) + m.group(4))
elif self.INDENT_RE.match(line):
# This is an indented (possibly nested) item.
if items[-1].startswith(' ' * MIN_NESTED_LIST_INDENT):
# Previous item was indented. Append to that item.
items[-1] = '{}\n{}'.format(items[-1], line)
else:
items.append(line)
else:
# This is another line of previous item. Append to that item.
items[-1] = '{}\n{}'.format(items[-1], line)
return items
class SaneUListProcessor(SaneOListProcessor):
""" Override `SIBLING_TAGS` to not include `ol`. """
TAG: str = 'ul'
SIBLING_TAGS = ['ul']
""" Exclude `ol` from list of siblings. """
def __init__(self, parser: blockparser.BlockParser):
super().__init__(parser)
# Detect an item (e.g., `- item` or `+ item` or `* item`).
max_list_start_indent = self.tab_length - 1
self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % max_list_start_indent)
self.CHILD_RE = re.compile(r'^[ ]{0,%d}(([*+-]))[ ]+(.*)' % (MIN_NESTED_LIST_INDENT - 1))
def get_items(self, block: str) -> list[str]:
""" Break a block into list items. """
# If first level of list is indented, remove that indentation
if (indent_len := len(block) - len(block.lstrip())) > 0:
block = self.looseDetab(block, indent_len)
items = []
for line in block.split('\n'):
m = self.CHILD_RE.match(line)
if m:
# Append to the list
items.append(m.group(3))
elif self.INDENT_RE.match(line):
# This is an indented (possibly nested) item.
if items[-1].startswith(' ' * MIN_NESTED_LIST_INDENT):
# Previous item was indented. Append to that item.
items[-1] = '{}\n{}'.format(items[-1], line)
else:
items.append(line)
else:
# This is another line of previous item. Append to that item.
items[-1] = '{}\n{}'.format(items[-1], line)
return items
class SaneParagraphProcessor(ParagraphProcessor):
""" Process Paragraph blocks. """
def __init__(self, parser: BlockParser):
super().__init__(parser)
max_list_start_indent = self.tab_length - 1
self.LIST_RE = re.compile(r"\s{2}\n(\s{0,%d}[\d+*-])" % max_list_start_indent)
def run(self, parent: etree.Element, blocks: list[str]) -> None:
block = blocks.pop(0)
if block.strip():
# Not a blank block. Add to parent, otherwise throw it away.
if self.parser.state.isstate('list'):
# The parent is a tight-list.
#
# Check for any children. This will likely only happen in a
# tight-list when a header isn't followed by a blank line.
# For example:
#
# * # Header
# Line 2 of list item - not part of header.
sibling = self.lastChild(parent)
if sibling is not None:
# Insert after sibling.
if sibling.tail:
sibling.tail = '{}\n{}'.format(sibling.tail, block)
else:
sibling.tail = '\n%s' % block
else:
# Append to parent.text
if parent.text:
parent.text = '{}\n{}'.format(parent.text, block)
else:
parent.text = block.lstrip()
else:
# Check if paragraph contains a list
next_list_block = None
if list_match := self.LIST_RE.search(block):
list_start = list_match.end() - len(list_match.group(1))
next_list_block = block[list_start:]
block = block[:list_start]
# Create a regular paragraph
p = etree.SubElement(parent, 'p')
p.text = block.lstrip()
# If a list was found, parse its block separately with the paragraph as the parent
if next_list_block:
self.parser.parseBlocks(p, [next_list_block])
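# For illustration (hypothetical input): a block such as
#   "Shopping list:  \n- apples\n- pears"
# is emitted as a single <p> whose text is the intro line, with the bullet
# list parsed as a child of that paragraph instead of being flattened into
# the paragraph text.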
class SaneListExtension(Extension):
""" Add sane lists to Markdown. """
def extendMarkdown(self, md):
""" Override existing Processors. """
md.parser.blockprocessors.register(SaneListIndentProcessor(md.parser), 'indent', 90)
md.parser.blockprocessors.register(SaneOListProcessor(md.parser), 'olist', 40)
md.parser.blockprocessors.register(SaneUListProcessor(md.parser), 'ulist', 30)
md.parser.blockprocessors.register(SaneParagraphProcessor(md.parser), 'paragraph', 10)
def makeExtension(**kwargs): # pragma: no cover
return SaneListExtension(**kwargs)
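A minimal usage sketch, assuming the classes above are in scope (the module path is not shown in this view):

    import markdown

    text = "3. third\n4. fourth\n    - nested bullet"
    print(markdown.markdown(text, extensions=[SaneListExtension()]))

With these processors registered, the numbered list should keep its original start value (3), and the indented bullet should render as a list nested inside the second item.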

View file

@ -46,6 +46,7 @@ settings = {
'max_tokens_second': 0,
'max_updates_second': 0,
'prompt_lookup_num_tokens': 0,
'static_cache': False,
'custom_stopping_strings': '',
'custom_token_bans': '',
'auto_max_new_tokens': False,
@ -85,7 +86,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2.')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
@ -103,6 +104,7 @@ group.add_argument('--force-safetensors', action='store_true', help='Set use_saf
group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
group.add_argument('--use_eager_attention', action='store_true', help='Set attn_implementation= eager while loading the model.')
group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.')
# bitsandbytes 4-bit
group = parser.add_argument_group('bitsandbytes 4-bit')
@ -145,17 +147,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
# AutoGPTQ
group = parser.add_argument_group('AutoGPTQ')
group.add_argument('--triton', action='store_true', help='Use triton.')
group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.')
group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')
group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
# HQQ
group = parser.add_argument_group('HQQ')
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
@ -202,6 +193,8 @@ group.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudf
group.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.')
group.add_argument('--api-key', type=str, default='', help='API authentication key.')
group.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.')
group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 for the API')
group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API')
group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')
# Multimodal
@ -218,6 +211,14 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='DEP
group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED')
group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED')
group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED')
group.add_argument('--triton', action='store_true', help='DEPRECATED')
group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED')
group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED')
group.add_argument('--desc_act', action='store_true', help='DEPRECATED')
group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED')
group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED')
group.add_argument('--wbits', type=int, default=0, help='DEPRECATED')
group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED')
args = parser.parse_args()
args_defaults = parser.parse_args([])
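For reference, the options added above can be combined on the command line when starting the web UI (a typical invocation; model selection and other flags omitted):

    python server.py --loader Transformers --torch-compile --api --api-enable-ipv6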
@ -260,10 +261,6 @@ def fix_loader_name(name):
return 'llamacpp_HF'
elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
return 'Transformers'
elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']:
return 'AutoGPTQ'
elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
return 'ExLlama'
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
return 'ExLlamav2'
elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:

View file

@ -16,7 +16,7 @@ from transformers import (
)
import modules.shared as shared
from modules import models
from modules import models, sampler_hijack
from modules.cache_utils import process_llamacpp_cache
from modules.callbacks import (
Iteratorize,
@ -28,7 +28,9 @@ from modules.grammar.grammar_utils import initialize_grammar
from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.models import clear_torch_cache, load_model
from modules.models import clear_torch_cache, get_device, load_model
sampler_hijack.hijack_samplers()
def generate_reply(*args, **kwargs):
@ -79,7 +81,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
all_stop_strings += st
shared.stop_everything = False
clear_torch_cache()
seed = set_manual_seed(state['seed'])
last_update = -1
reply = ''
@ -160,18 +161,12 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
import deepspeed
return input_ids.to(deepspeed.get_accelerator().current_device_name())
elif torch.backends.mps.is_available():
device = torch.device('mps')
return input_ids.to(device)
elif is_torch_xpu_available():
return input_ids.to("xpu:0")
elif is_torch_npu_available():
return input_ids.to("npu:0")
else:
return input_ids.cuda()
device = get_device()
if device:
return input_ids.to(device)
return input_ids
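The device-specific branches above are replaced by a single helper imported from modules.models, whose body is not part of this diff. A minimal sketch of what such a helper can look like, mirroring the removed branches (the exact implementation is an assumption):

    import torch
    from transformers import is_torch_npu_available, is_torch_xpu_available

    import modules.shared as shared

    def get_device():
        # Return the torch device that tensors should be moved to,
        # or None for CPU-only runs.
        if shared.args.cpu:
            return None
        if torch.cuda.is_available():
            return torch.device('cuda')
        if torch.backends.mps.is_available():
            return torch.device('mps')
        if is_torch_xpu_available():
            return torch.device('xpu:0')
        if is_torch_npu_available():
            return torch.device('npu:0')
        return None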
def decode(output_ids, skip_special_tokens=True):
@ -288,6 +283,9 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
if shared.args.loader == 'Transformers':
clear_torch_cache()
generate_params = {}
for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers', 'xtc_threshold', 'xtc_probability']:
if k in state:
@ -304,6 +302,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
if state['prompt_lookup_num_tokens'] > 0:
generate_params['prompt_lookup_num_tokens'] = state['prompt_lookup_num_tokens']
if state['static_cache']:
generate_params['cache_implementation'] = 'static'
for k in ['epsilon_cutoff', 'eta_cutoff']:
if state[k] > 0:
generate_params[k] = state[k] * 1e-4
@ -326,7 +327,6 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
# Encode the input
input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
output = input_ids[0]
cuda = not any((shared.args.cpu, shared.args.deepspeed))
if state['auto_max_new_tokens']:
generate_params['max_new_tokens'] = state['truncation_length'] - input_ids.shape[-1]
@ -381,8 +381,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
if not state['stream']:
with torch.no_grad():
output = shared.model.generate(**generate_params)[0]
if cuda:
output = output.cuda()
device = get_device()
if device:
output = output.to(device)
starting_from = 0 if shared.is_seq2seq else len(input_ids[0])
yield get_reply_from_output_ids(output, state, starting_from=starting_from)
@ -393,7 +394,6 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
def generate_with_callback(callback=None, *args, **kwargs):
kwargs['stopping_criteria'].append(Stream(callback_func=callback))
clear_torch_cache()
with torch.no_grad():
shared.model.generate(**kwargs)
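As the static_cache branch above shows, the new setting simply forwards cache_implementation='static' to Transformers' generate(). Outside the web UI, the equivalent call looks like this (the model id is a placeholder; the model must support the static KV cache):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = 'your-model-here'  # placeholder: use a model that supports StaticCache
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    inputs = tokenizer('Hello', return_tensors='pt')
    output_ids = model.generate(**inputs, cache_implementation='static', max_new_tokens=32)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))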

View file

@ -109,23 +109,16 @@ def list_model_elements():
'disk',
'cpu',
'bf16',
'load_in_4bit',
'load_in_8bit',
'torch_compile',
'trust_remote_code',
'no_use_fast',
'use_flash_attention_2',
'use_eager_attention',
'load_in_4bit',
'compute_dtype',
'quant_type',
'use_double_quant',
'wbits',
'groupsize',
'triton',
'desc_act',
'no_inject_fused_mlp',
'no_use_cuda_fp16',
'disable_exllama',
'disable_exllamav2',
'cfg_cache',
'no_flash_attn',
'no_xformers',
@ -220,6 +213,7 @@ def list_interface_input_elements():
'custom_stopping_strings',
'skip_special_tokens',
'stream',
'static_cache',
'tfs',
'top_a',
]
@ -230,6 +224,7 @@ def list_interface_input_elements():
'start_with',
'character_menu',
'history',
'search_chat',
'unique_id',
'name1',
'user_bio',

View file

@ -18,16 +18,19 @@ def create_ui():
mu = shared.args.multi_user
shared.gradio['Chat input'] = gr.State()
shared.gradio['history'] = gr.JSON({'internal': [], 'visible': []}, visible=False)
shared.gradio['history'] = gr.JSON(visible=False)
with gr.Tab('Chat', elem_id='chat-tab'):
with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
with gr.Column():
with gr.Row(elem_id='past-chats-buttons'):
shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu)
shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu)
shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input'])
shared.gradio['search_chat'] = gr.Textbox(placeholder='Search chats...', max_lines=1, elem_id='search_chat')
with gr.Row(elem_id='delete-chat-row', visible=False) as shared.gradio['delete-chat-row']:
shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', elem_classes=['refresh-button', 'focus-on-chat-input'])
shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', elem_classes=['refresh-button', 'focus-on-chat-input'])
@ -250,6 +253,10 @@ def create_event_handlers():
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_delete_chat_confirm_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'delete-chat-row'), show_progress=False)
shared.gradio['branch_chat'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False)
shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False)
shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False)
shared.gradio['rename_to-confirm'].click(
@ -260,6 +267,10 @@ def create_event_handlers():
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename-row'), show_progress=False)
shared.gradio['search_chat'].change(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_search_chat_change, gradio('interface_state'), gradio('unique_id'), show_progress=False)
shared.gradio['load_chat_history'].upload(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_upload_chat_history, gradio('load_chat_history', 'interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then(
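The new search box is wired to chat.handle_search_chat_change, which lives in modules/chat.py and is not part of this diff. A minimal sketch of what such a handler could do, with illustrative (not actual) helper names:

    import gradio as gr

    def handle_search_chat_change(state):
        # Hypothetical: filter the saved chat names by the search string and
        # refresh the 'unique_id' dropdown with the matches.
        query = (state.get('search_chat') or '').strip().lower()
        histories = list_saved_histories(state)  # assumed helper returning chat names
        matches = [name for name in histories if query in name.lower()]
        return gr.update(choices=matches, value=matches[0] if matches else None)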

View file

@ -89,8 +89,6 @@ def create_ui():
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
@ -105,9 +103,10 @@ def create_ui():
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
with gr.Column():
shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.')
shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled without GGML_CUDA_FORCE_MMQ. This may improve performance on newer cards.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
@ -120,10 +119,6 @@ def create_ui():
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
@ -135,13 +130,10 @@ def create_ui():
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.')
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.')
shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
with gr.Column():

View file

@ -83,6 +83,7 @@ def create_ui(default_preset):
shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')
with gr.Column():
shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')

View file

@ -232,33 +232,45 @@ def get_user_choice(question, options_dict):
def install_webui():
# Ask the user for the GPU vendor
if "GPU_CHOICE" in os.environ:
choice = os.environ["GPU_CHOICE"].upper()
print_big_message(f"Selected GPU choice \"{choice}\" based on the GPU_CHOICE environment variable.")
# Warn about changed meanings and handle old NVIDIA choice
if choice == "B":
print_big_message("Warning: GPU_CHOICE='B' now means 'NVIDIA (CUDA 11.8)' in the new version.")
elif choice == "C":
print_big_message("Warning: GPU_CHOICE='C' now means 'AMD' in the new version.")
elif choice == "D":
print_big_message("Warning: GPU_CHOICE='D' now means 'Apple M Series' in the new version.")
elif choice == "A" and "USE_CUDA118" in os.environ:
choice = "B" if os.environ.get("USE_CUDA118", "").lower() in ("yes", "y", "true", "1", "t", "on") else "A"
else:
choice = get_user_choice(
"What is your GPU?",
{
'A': 'NVIDIA',
'B': 'AMD (Linux/MacOS only. Requires ROCm SDK 6.1 on Linux)',
'C': 'Apple M Series',
'D': 'Intel Arc (IPEX)',
'N': 'None (I want to run models in CPU mode)'
'A': 'NVIDIA - CUDA 12.1 (recommended)',
'B': 'NVIDIA - CUDA 11.8 (legacy GPUs)',
'C': 'AMD - Linux/macOS only, requires ROCm 6.1',
'D': 'Apple M Series',
'E': 'Intel Arc (beta)',
'N': 'CPU mode'
},
)
# Convert choices to GPU names for compatibility
gpu_choice_to_name = {
"A": "NVIDIA",
"B": "AMD",
"C": "APPLE",
"D": "INTEL",
"B": "NVIDIA",
"C": "AMD",
"D": "APPLE",
"E": "INTEL",
"N": "NONE"
}
selected_gpu = gpu_choice_to_name[choice]
use_cuda118 = "N"
use_cuda118 = (choice == "B") # CUDA version is now determined by menu choice
# Write a flag to CMD_FLAGS.txt for CPU mode
if selected_gpu == "NONE":
@ -267,18 +279,9 @@ def install_webui():
print_big_message("Adding the --cpu flag to CMD_FLAGS.txt.")
cmd_flags_file.write("\n--cpu\n")
# Check if the user wants CUDA 11.8
# Handle CUDA version display
elif any((is_windows(), is_linux())) and selected_gpu == "NVIDIA":
if "USE_CUDA118" in os.environ:
use_cuda118 = "Y" if os.environ.get("USE_CUDA118", "").lower() in ("yes", "y", "true", "1", "t", "on") else "N"
else:
print("\nDo you want to use CUDA 11.8 instead of 12.1?\nOnly choose this option if your GPU is very old (Kepler or older).\n\nFor RTX and GTX series GPUs, say \"N\".\nIf unsure, say \"N\".\n")
use_cuda118 = input("Input (Y/N)> ").upper().strip('"\'').strip()
while use_cuda118 not in 'YN':
print("Invalid choice. Please try again.")
use_cuda118 = input("Input> ").upper().strip('"\'').strip()
if use_cuda118 == 'Y':
if use_cuda118:
print("CUDA: 11.8")
else:
print("CUDA: 12.1")
@ -394,7 +397,7 @@ def update_requirements(initial_installation=False, pull=True):
textgen_requirements = [
req.replace('+cu121', '+cu118').replace('+cu122', '+cu118')
for req in textgen_requirements
if "auto-gptq" not in req.lower() and "autoawq" not in req.lower()
if "autoawq" not in req.lower()
]
if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11

View file

@ -17,7 +17,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard
@ -32,22 +32,22 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
# llama-cpp-python (CUDA, no tensor cores)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
# llama-cpp-python (CUDA, tensor cores)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
# CUDA wheels
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -16,7 +16,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard
@ -31,14 +31,14 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
# AMD wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.5+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.5+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.6+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.6+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -16,7 +16,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard
@ -31,10 +31,10 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
# AMD wheels
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -16,7 +16,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard
@ -31,6 +31,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl

View file

@ -16,7 +16,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard
@ -31,8 +31,10 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.6-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl

View file

@ -16,7 +16,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard
@ -31,7 +31,7 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"

View file

@ -16,7 +16,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard
@ -31,7 +31,7 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"

View file

@ -17,7 +17,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard
@ -32,22 +32,22 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.6+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
# llama-cpp-python (CUDA, no tensor cores)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.6+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
# llama-cpp-python (CUDA, tensor cores)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.6+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
# CUDA wheels
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -16,7 +16,7 @@ pydantic==2.8.2
pyyaml
requests
rich
safetensors==0.4.*
safetensors==0.5.*
scipy
sentencepiece
tensorboard

View file

@ -22,6 +22,7 @@ ban_eos_token: false
add_bos_token: true
skip_special_tokens: true
stream: true
static_cache: false
character: Assistant
name1: You
custom_system_message: ''