From 16f4f1a1c33a57001ceb09afa10c27f5ae401fa8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Feb 2025 17:20:10 -0800
Subject: [PATCH 01/16] Bump transformers to 4.49

---
 requirements.txt                 | 2 +-
 requirements_amd.txt             | 2 +-
 requirements_amd_noavx2.txt      | 2 +-
 requirements_apple_intel.txt     | 2 +-
 requirements_apple_silicon.txt   | 2 +-
 requirements_cpu_only.txt        | 2 +-
 requirements_cpu_only_noavx2.txt | 2 +-
 requirements_noavx2.txt          | 2 +-
 requirements_nowheels.txt        | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 4ff7a6df..b4e358fb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,7 +21,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 
diff --git a/requirements_amd.txt b/requirements_amd.txt
index e30f30ee..0ceb9d9e 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -20,7 +20,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 15d25caa..330c73d1 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -20,7 +20,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index b614acf4..185e6cad 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -20,7 +20,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index ca9cc3ac..f70d1c43 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -20,7 +20,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index e9a97905..6467f996 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -20,7 +20,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index c4357676..cbebdbbc 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -20,7 +20,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 40cbc7b0..23d1c20d 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -21,7 +21,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index 3d6c922f..f20a7332 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -20,7 +20,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.48.*
+transformers==4.49.*
 tqdm
 wandb
 

From dba17c40fc67fd4e64a26214c47d745bf5a42d18 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Feb 2025 17:31:11 -0800
Subject: [PATCH 02/16] Make transformers 4.49 functional

---
 modules/sampler_hijack.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
index d202af1f..e0df49c3 100644
--- a/modules/sampler_hijack.py
+++ b/modules/sampler_hijack.py
@@ -5,7 +5,7 @@ import random
 
 import torch
 import transformers
-from transformers import LogitsWarper
+from transformers import LogitsProcessor
 from transformers.generation.logits_process import (
     LogitNormalization,
     LogitsProcessor,
@@ -19,7 +19,7 @@ from modules.models import get_device
 global_scores = None
 
 
-class TemperatureLogitsWarperCustom(LogitsWarper):
+class TemperatureLogitsWarperCustom(LogitsProcessor):
     '''
     A copy of the original Transformers temperature logits warper.
     '''
@@ -42,7 +42,7 @@ class TemperatureLogitsWarperCustom(LogitsWarper):
         return scores
 
 
-class DynamicTemperatureLogitsWarper(LogitsWarper):
+class DynamicTemperatureLogitsWarper(LogitsProcessor):
     '''
     Dynamic temperature.
     '''
@@ -100,7 +100,7 @@ class DynamicTemperatureLogitsWarper(LogitsWarper):
         return scores
 
 
-class QuadraticSamplingLogitsWarper(LogitsWarper):
+class QuadraticSamplingLogitsWarper(LogitsProcessor):
     '''
     Quadratic sampling with smoothing factor and smoothing curve parameters.
     '''
@@ -127,7 +127,7 @@ class QuadraticSamplingLogitsWarper(LogitsWarper):
         return transformed_logits
 
 
-class TailFreeLogitsWarper(LogitsWarper):
+class TailFreeLogitsWarper(LogitsProcessor):
     def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
         tfs = float(tfs)
         if tfs < 0 or tfs > 1.0:
@@ -167,7 +167,7 @@ class TailFreeLogitsWarper(LogitsWarper):
         return scores
 
 
-class TopALogitsWarper(LogitsWarper):
+class TopALogitsWarper(LogitsProcessor):
     def __init__(self, top_a: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
         top_a = float(top_a)
         if top_a < 0 or top_a > 1.0:
@@ -194,7 +194,7 @@ class TopALogitsWarper(LogitsWarper):
 
 
 # Exclude Top Choices (XTC)
-class XTCLogitsWarper(LogitsWarper):
+class XTCLogitsWarper(LogitsProcessor):
     def __init__(self, threshold: float, probability: float, filter_value: float = -float("Inf")):
         self.threshold = threshold
         self.probability = probability
@@ -312,7 +312,7 @@ class DRYLogitsProcessor(LogitsProcessor):
         return scores
 
 
-class MirostatLogitsWarper(LogitsWarper):
+class MirostatLogitsWarper(LogitsProcessor):
     def __init__(self, mirostat_mode: int, mirostat_tau: float, mirostat_eta: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
         if mirostat_mode not in [2]:
             raise ValueError(f"`mirostat` has to be a an integer 2, but is {mirostat_mode}")
@@ -361,7 +361,7 @@ class MirostatLogitsWarper(LogitsWarper):
         return scores
 
 
-class SpyLogitsWarper(LogitsWarper):
+class SpyLogitsWarper(LogitsProcessor):
     def __init__(self):
         pass
 

From 12f6f7ba9ff327a67ee334e84da4e7f292819a25 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 17 Feb 2025 22:35:38 -0300
Subject: [PATCH 03/16] Update accelerate requirement from ==1.3.* to ==1.4.*
 (#6753)

---
 requirements.txt                 | 2 +-
 requirements_amd.txt             | 2 +-
 requirements_amd_noavx2.txt      | 2 +-
 requirements_apple_intel.txt     | 2 +-
 requirements_apple_silicon.txt   | 2 +-
 requirements_cpu_only.txt        | 2 +-
 requirements_cpu_only_noavx2.txt | 2 +-
 requirements_noavx2.txt          | 2 +-
 requirements_nowheels.txt        | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b4e358fb..d09f6bf5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 bitsandbytes==0.45.*
 colorama
 datasets
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 0ceb9d9e..124ad6b6 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 colorama
 datasets
 einops
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 330c73d1..9e3063c3 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 colorama
 datasets
 einops
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 185e6cad..0ef8db34 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 colorama
 datasets
 einops
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index f70d1c43..9c4bcc11 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 colorama
 datasets
 einops
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index 6467f996..9f19238a 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 colorama
 datasets
 einops
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index cbebdbbc..042dbbd8 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 colorama
 datasets
 einops
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 23d1c20d..f488cafc 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 bitsandbytes==0.45.*
 colorama
 datasets
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index f20a7332..1bc2e385 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -1,4 +1,4 @@
-accelerate==1.3.*
+accelerate==1.4.*
 colorama
 datasets
 einops

From 01f20d2d9f4e3a96556c6dce560d0068bc2eb492 Mon Sep 17 00:00:00 2001
From: Alireza Ghasemi <alireza.ghasemi@utwente.nl>
Date: Tue, 18 Feb 2025 02:38:15 +0100
Subject: [PATCH 04/16] Improve SuperboogaV2 with Date/Time Embeddings, GPU
 Support, and Multiple File Formats (#6748)

---
 extensions/superboogav2/README.md         |  42 ++++++-
 extensions/superboogav2/chromadb.py       |  36 +++---
 extensions/superboogav2/config.json       |   3 +
 extensions/superboogav2/data_processor.py |   8 ++
 extensions/superboogav2/optimize.py       |   6 +-
 extensions/superboogav2/parameters.py     |   8 ++
 extensions/superboogav2/requirements.txt  |  16 ++-
 extensions/superboogav2/script.py         | 146 ++++++++++++++++++++--
 8 files changed, 227 insertions(+), 38 deletions(-)

diff --git a/extensions/superboogav2/README.md b/extensions/superboogav2/README.md
index d25b3a5e..0460c401 100644
--- a/extensions/superboogav2/README.md
+++ b/extensions/superboogav2/README.md
@@ -1,5 +1,41 @@
-# superboogav2
+# SuperboogaV2
 
-For a description, please see the comments in this Pull Request:
+Enhance your LLM with additional information from text, URLs, and files for more accurate and context-aware responses.
 
-https://github.com/oobabooga/text-generation-webui/pull/3272
+---
+
+
+
+## Installation and Activation
+
+1. Start the conda environment by running `cmd_windows.bat` or the equivalent for your system in the root directory of `text-generation-webui`.
+2. Install the necessary packages:
+   ```
+   pip install -r extensions/superboogav2/requirements.txt
+   ```
+3. Activate the extension in the `Session` tab of the web UI.
+4. Click on `Apply flags/extensions and restart`. Optionally save the configuration by clicking on `Save UI defaults to settings.yaml`.
+
+## Usage and Features
+
+After activation, you can scroll further down in the chat UI to reveal the SuperboogaV2 interface. Here, you can add extra information to your chats by entering text, listing multiple URLs, or uploading multiple files, subject to the context window limit of your model.
+
+The extra information and, when the `Add date and time to Data` setting is enabled, the current date and time are provided to the model as embeddings that persist across conversations. To clear them, click the `Clear Data` button and start a new chat. You can adjust the text extraction parameters and other options in the `Settings` tab.
+
+## Supported File Formats
+
+SuperboogaV2 utilizes MuPDF, pandas, python-docx, python-pptx, and odfpy to extract text from various file formats, including:
+
+- TXT
+- PDF
+- EPUB
+- HTML
+- CSV
+- ODT/ODS/ODP
+- DOCX/PPTX/XLSX
+
+## Additional Information
+
+SuperboogaV2 processes your data into context-aware chunks, applies cleaning techniques, and stores them as embeddings to minimize redundant computations. Relevance is determined using distance calculations and prioritization of recent information.
+
+For a detailed description and more information, refer to the comments in this pull request: [https://github.com/oobabooga/text-generation-webui/pull/3272](https://github.com/oobabooga/text-generation-webui/pull/3272)
diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py
index 3381fb14..c9e450e4 100644
--- a/extensions/superboogav2/chromadb.py
+++ b/extensions/superboogav2/chromadb.py
@@ -1,7 +1,7 @@
 import math
 import random
 import threading
-
+import torch
 import chromadb
 import numpy as np
 import posthog
@@ -16,9 +16,6 @@ from modules.text_generation import decode, encode
 posthog.capture = lambda *args, **kwargs: None
 
 
-embedder = embedding_functions.SentenceTransformerEmbeddingFunction("sentence-transformers/all-mpnet-base-v2")
-
-
 class Info:
     def __init__(self, start_index, text_with_context, distance, id):
         self.text_with_context = text_with_context
@@ -77,11 +74,23 @@ class Info:
 
 class ChromaCollector():
     def __init__(self):
-        name = ''.join(random.choice('ab') for _ in range(10))
+        name = "".join(random.choice("ab") for _ in range(10))
 
         self.name = name
-        self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
-        self.collection = self.chroma_client.create_collection(name=name, embedding_function=embedder)
+        self.embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
+            "sentence-transformers/all-mpnet-base-v2",
+            device=("cuda" if torch.cuda.is_available() else "cpu"),
+        )
+        chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
+        self.collection = chroma_client.create_collection(
+            name=self.name,
+            embedding_function=self.embedder,
+            metadata={
+                "hnsw:search_ef": 200,
+                "hnsw:construction_ef": 200,
+                "hnsw:M": 64,
+            },
+        )
 
         self.ids = []
         self.id_to_info = {}
@@ -110,7 +119,7 @@ class ChromaCollector():
 
             # If there are any non-existing texts, compute their embeddings all at once. Each call to embed has significant overhead.
             if non_existing_texts:
-                non_existing_embeddings = embedder(non_existing_texts)
+                non_existing_embeddings = self.embedder(non_existing_texts)
                 for text, embedding in zip(non_existing_texts, non_existing_embeddings):
                     self.embeddings_cache[text] = embedding
 
@@ -139,7 +148,7 @@ class ChromaCollector():
             id_ = new_ids[i]
             metadata = metadatas[i] if metadatas is not None else None
             embedding = self.embeddings_cache.get(text)
-            if embedding:
+            if embedding is not None and embedding.any():
                 existing_texts.append(text)
                 existing_embeddings.append(embedding)
                 existing_ids.append(id_)
@@ -323,6 +332,8 @@ class ChromaCollector():
     def delete(self, ids_to_delete: list[str], where: dict):
         with self.lock:
             ids_to_delete = self.collection.get(ids=ids_to_delete, where=where)['ids']
+            if not ids_to_delete:
+                return
             self.collection.delete(ids=ids_to_delete, where=where)
 
             # Remove the deleted ids from self.ids and self.id_to_info
@@ -335,12 +346,7 @@ class ChromaCollector():
 
     def clear(self):
         with self.lock:
-            self.chroma_client.reset()
-
-            self.ids = []
-            self.chroma_client.delete_collection(name=self.name)
-            self.collection = self.chroma_client.create_collection(name=self.name, embedding_function=embedder)
-
+            self.__init__()  # reinitialize the collector
             logger.info('Successfully cleared all records and reset chromaDB.')
 
 
diff --git a/extensions/superboogav2/config.json b/extensions/superboogav2/config.json
index 0f1034f5..5de3d870 100644
--- a/extensions/superboogav2/config.json
+++ b/extensions/superboogav2/config.json
@@ -127,6 +127,9 @@
       "default": "\n\n<<document end>>\n\n"
     },
     "manual": {
+      "default": false
+    },
+    "add_date_time": {
       "default": true
     },
     "add_chat_to_data": {
diff --git a/extensions/superboogav2/data_processor.py b/extensions/superboogav2/data_processor.py
index 0a96d4a4..3c5e5c9f 100644
--- a/extensions/superboogav2/data_processor.py
+++ b/extensions/superboogav2/data_processor.py
@@ -6,6 +6,7 @@ It will only include full words.
 
 import bisect
 import re
+from datetime import datetime
 
 import extensions.superboogav2.parameters as parameters
 
@@ -154,6 +155,13 @@ def process_and_add_to_collector(corpus: str, collector: ChromaCollector, clear_
     data_chunks_with_context = []
     data_chunk_starting_indices = []
 
+    if parameters.get_add_date_time():
+        now = datetime.now()
+        date_time_chunk = f"Current time is {now.strftime('%H:%M:%S')}. Today is {now.strftime('%A')}. The current date is {now.strftime('%Y-%m-%d')}."
+        data_chunks.append(date_time_chunk)
+        data_chunks_with_context.append(date_time_chunk)
+        data_chunk_starting_indices.append(0)
+
     # Handling chunk_regex
     if parameters.get_chunk_regex():
         if parameters.get_chunk_separator():
diff --git a/extensions/superboogav2/optimize.py b/extensions/superboogav2/optimize.py
index ebdd03c6..3597fdf1 100644
--- a/extensions/superboogav2/optimize.py
+++ b/extensions/superboogav2/optimize.py
@@ -39,11 +39,11 @@ def _markdown_hyperparams():
 # Convert numpy types to python types.
 def _convert_np_types(params):
     for key in params:
-        if type(params[key]) == np.bool_:
+        if isinstance(params[key], np.bool_):
             params[key] = bool(params[key])
-        elif type(params[key]) == np.int64:
+        elif isinstance(params[key], np.int64):
             params[key] = int(params[key])
-        elif type(params[key]) == np.float64:
+        elif isinstance(params[key], np.float64):
             params[key] = float(params[key])
     return params
 
diff --git a/extensions/superboogav2/parameters.py b/extensions/superboogav2/parameters.py
index 8bb2d1a6..e691dae1 100644
--- a/extensions/superboogav2/parameters.py
+++ b/extensions/superboogav2/parameters.py
@@ -251,6 +251,10 @@ def get_is_manual() -> bool:
     return bool(Parameters.getInstance().hyperparameters['manual']['default'])
 
 
+def get_add_date_time() -> bool:
+    return bool(Parameters.getInstance().hyperparameters['add_date_time']['default'])
+
+
 def get_add_chat_to_data() -> bool:
     return bool(Parameters.getInstance().hyperparameters['add_chat_to_data']['default'])
 
@@ -331,6 +335,10 @@ def set_manual(value: bool):
     Parameters.getInstance().hyperparameters['manual']['default'] = value
 
 
+def set_add_date_time(value: bool):
+    Parameters.getInstance().hyperparameters['add_date_time']['default'] = value
+
+
 def set_add_chat_to_data(value: bool):
     Parameters.getInstance().hyperparameters['add_chat_to_data']['default'] = value
 
diff --git a/extensions/superboogav2/requirements.txt b/extensions/superboogav2/requirements.txt
index d9031167..6de51e63 100644
--- a/extensions/superboogav2/requirements.txt
+++ b/extensions/superboogav2/requirements.txt
@@ -1,10 +1,16 @@
-beautifulsoup4==4.12.2
-chromadb==0.4.24
+beautifulsoup4==4.13.3
+chromadb==0.6.3
 lxml
+nltk
 optuna
-pandas==2.0.3
-posthog==2.4.2
-sentence_transformers==2.2.2
+pandas
+posthog==3.13.0
+sentence_transformers==3.3.1
 spacy
 pytextrank
 num2words
+PyMuPDF
+python-docx
+python-pptx
+openpyxl
+odfpy
\ No newline at end of file
diff --git a/extensions/superboogav2/script.py b/extensions/superboogav2/script.py
index 77c5cced..13c58df9 100644
--- a/extensions/superboogav2/script.py
+++ b/extensions/superboogav2/script.py
@@ -9,6 +9,13 @@ os.environ['NLTK_DATA'] = str(Path("extensions/superboogav2/nltk_data").resolve(
 
 import codecs
 import textwrap
+import docx
+import pptx
+import fitz
+fitz.TOOLS.mupdf_display_errors(False)
+import pandas as pd
+from odf.opendocument import load
+from odf.draw import Page
 
 import gradio as gr
 
@@ -46,11 +53,123 @@ def _feed_data_into_collector(corpus):
     yield '### Done.'
 
 
-def _feed_file_into_collector(file):
-    yield '### Reading and processing the input dataset...'
-    text = file.decode('utf-8')
-    process_and_add_to_collector(text, collector, False, create_metadata_source('file'))
-    yield '### Done.'
+def _feed_file_into_collector(files):
+    if not files:
+        logger.warning("No files selected.")
+        return
+
+    def read_binary_file(file_path):
+        try:
+            with open(file_path, 'rb') as f:
+                return f.read()
+        except Exception:
+            logger.error(f"Failed to read {file_path}.")
+            return None
+
+    def extract_with_utf8(text):
+        try:
+            return text.decode('utf-8')
+        except Exception:
+            return ""
+
+    def extract_with_fitz(file_content):
+        try:
+            with fitz.open(stream=file_content, filetype=None) as doc:
+                num_pages = doc.page_count
+                text = "\n".join(block[4] for page in doc for block in page.get_text("blocks") if block[6] == 0)
+                logger.info(f"Extracted text from {num_pages} pages with fitz.")
+                return text
+        except Exception:
+            return ""
+
+    def extract_with_docx(file_path):
+        try:
+            paragraphs = docx.Document(file_path).paragraphs
+            text = "\n".join(para.text for para in paragraphs)
+            logger.info(f"Extracted text from {len(paragraphs)} paragraphs with docx.")
+            return text
+        except Exception:
+            return ""
+
+    def extract_with_pptx(file_path):
+        try:
+            slides = pptx.Presentation(file_path).slides
+            text = "\n".join(
+                shape.text for slide in slides for shape in slide.shapes if hasattr(shape, "text")
+            )
+            logger.info(f"Extracted text from {len(slides)} slides with pptx.")
+            return text
+        except Exception:
+            return ""
+
+    def extract_with_odf(file_path):
+        if not file_path.endswith(".odp"):
+            return ""
+        try:
+            doc = load(file_path)
+            text_content = []
+
+            def extract_text(element):
+                parts = []
+                if hasattr(element, "childNodes"):
+                    for node in element.childNodes:
+                        if node.nodeType == node.TEXT_NODE:
+                            parts.append(node.data)
+                        else:
+                            parts.append(extract_text(node))
+                return "".join(parts)
+
+            for slide in doc.getElementsByType(Page):
+                slide_text = extract_text(slide)
+                if slide_text.strip():
+                    text_content.append(slide_text.strip())
+
+            text = "\n".join(text_content)
+            logger.info(f"Extracted text from {len(doc.getElementsByType(Page))} slides with odf.")
+            return text
+        except Exception as e:
+            logger.error(f"Failed to extract text from {file_path}: {str(e)}")
+            return ""
+
+    def extract_with_pandas(file_path):
+        try:
+            df = pd.read_excel(file_path)
+            text = "\n".join(str(cell) for col in df.columns for cell in df[col])
+            logger.info(f"Extracted text from {df.shape[0]}x{df.shape[1]} cells with pandas.")
+            return text
+        except Exception:
+            return ""
+
+    for index, file in enumerate(files, start=1):
+        file_name = os.path.basename(file)
+        logger.info(f"Processing {file_name}...")
+
+        file_content = read_binary_file(file)
+        if not file_content:
+            continue
+
+        text_extractors = [
+            lambda: extract_with_utf8(file_content),
+            lambda: extract_with_fitz(file_content),
+            lambda: extract_with_docx(file),
+            lambda: extract_with_pptx(file),
+            lambda: extract_with_odf(file),
+            lambda: extract_with_pandas(file),
+        ]
+
+        for extractor in text_extractors:
+            text = extractor()
+            if text:
+                break
+
+        if not text:
+            logger.error(f"Failed to extract text from {file_name}, unsupported format.")
+            continue
+
+        process_and_add_to_collector(text, collector, False, create_metadata_source(f"file-{index}"))
+
+    logger.info("Done.")
+    yield "### Done."
 
 
 def _feed_url_into_collector(urls):
@@ -107,7 +226,7 @@ def _get_optimizable_settings() -> list:
 
 
 def _apply_settings(optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
-                    preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count,
+                    preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, add_date_time, postfix, data_separator, prefix, max_token_count,
                     chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup):
     logger.debug('Applying settings.')
 
@@ -124,6 +243,7 @@ def _apply_settings(optimization_steps, time_power, time_steepness, significant_
         parameters.set_injection_strategy(injection_strategy)
         parameters.set_add_chat_to_data(add_chat_to_data)
         parameters.set_manual(manual)
+        parameters.set_add_date_time(add_date_time)
         parameters.set_postfix(codecs.decode(postfix, 'unicode_escape'))
         parameters.set_data_separator(codecs.decode(data_separator, 'unicode_escape'))
         parameters.set_prefix(codecs.decode(prefix, 'unicode_escape'))
@@ -237,11 +357,11 @@ def ui():
                 url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
                 strong_cleanup = gr.Checkbox(value=parameters.get_is_strong_cleanup(), label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
                 threads = gr.Number(value=parameters.get_num_threads(), label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
-                update_url = gr.Button('Load data')
+                update_urls = gr.Button('Load data')
 
             with gr.Tab("File input"):
-                file_input = gr.File(label='Input file', type='binary')
-                update_file = gr.Button('Load data')
+                file_input = gr.File(label="Input file", type="filepath", file_count="multiple")
+                update_files = gr.Button('Load data')
 
             with gr.Tab("Settings"):
                 with gr.Accordion("Processing settings", open=True):
@@ -258,6 +378,7 @@ def ui():
                     postfix = gr.Textbox(value=codecs.encode(parameters.get_postfix(), 'unicode_escape').decode(), label='Postfix', info='What to put after the injection point.')
                     with gr.Row():
                         manual = gr.Checkbox(value=parameters.get_is_manual(), label="Is Manual", info="Manually specify when to use ChromaDB. Insert `!c` at the start or end of the message to trigger a query.", visible=shared.is_chat())
+                        add_date_time = gr.Checkbox(value=parameters.get_add_date_time(), label="Add date and time to Data", info="Make the current date and time available to the model.", visible=shared.is_chat())
                         add_chat_to_data = gr.Checkbox(value=parameters.get_add_chat_to_data(), label="Add Chat to Data", info="Automatically feed the chat history as you chat.", visible=shared.is_chat())
                     injection_strategy = gr.Radio(choices=[parameters.PREPEND_TO_LAST, parameters.APPEND_TO_LAST, parameters.HIJACK_LAST_IN_CONTEXT], value=parameters.get_injection_strategy(), label='Injection Strategy', info='Where to inject the messages in chat or instruct mode.', visible=shared.is_chat())
                     with gr.Row():
@@ -313,14 +434,14 @@ def ui():
             last_updated = gr.Markdown()
 
     all_params = [optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
-                  preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count,
+                  preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, add_date_time, postfix, data_separator, prefix, max_token_count,
                   chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup]
     optimizable_params = [time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
                           preprocess_pipeline, chunk_count, context_len, chunk_len]
 
     update_data.click(_feed_data_into_collector, [data_input], last_updated, show_progress=False)
-    update_url.click(_feed_url_into_collector, [url_input], last_updated, show_progress=False)
-    update_file.click(_feed_file_into_collector, [file_input], last_updated, show_progress=False)
+    update_urls.click(_feed_url_into_collector, [url_input], last_updated, show_progress=False)
+    update_files.click(_feed_file_into_collector, [file_input], last_updated, show_progress=False)
     benchmark_button.click(_begin_benchmark, [], last_updated, show_progress=True)
     optimize_button.click(_begin_optimization, [], [last_updated] + optimizable_params, show_progress=True)
     clear_button.click(_clear_data, [], last_updated, show_progress=False)
@@ -339,6 +460,7 @@ def ui():
     api_on.input(fn=_apply_settings, inputs=all_params, show_progress=False)
     injection_strategy.input(fn=_apply_settings, inputs=all_params, show_progress=False)
     add_chat_to_data.input(fn=_apply_settings, inputs=all_params, show_progress=False)
+    add_date_time.input(fn=_apply_settings, inputs=all_params, show_progress=False)
     manual.input(fn=_apply_settings, inputs=all_params, show_progress=False)
     postfix.input(fn=_apply_settings, inputs=all_params, show_progress=False)
     data_separator.input(fn=_apply_settings, inputs=all_params, show_progress=False)

From b131f865840aff5ccb7516535efc2c683f763cf1 Mon Sep 17 00:00:00 2001
From: SeanScripts <64337075+SeanScripts@users.noreply.github.com>
Date: Tue, 18 Feb 2025 06:56:28 -0800
Subject: [PATCH 05/16] Perplexity colors extension v2 (#6756)

---
 extensions/perplexity_colors/script.py | 275 ++++++++++++++++++-------
 1 file changed, 201 insertions(+), 74 deletions(-)

diff --git a/extensions/perplexity_colors/script.py b/extensions/perplexity_colors/script.py
index 2a986ac4..849e4e63 100644
--- a/extensions/perplexity_colors/script.py
+++ b/extensions/perplexity_colors/script.py
@@ -1,9 +1,14 @@
 import time
 
+import html
+import functools
+import re
+
 import gradio
 import numpy as np
 import torch
 from transformers import LogitsProcessor
+import colorsys
 
 from modules import html_generator, shared
 
@@ -28,7 +33,7 @@ class PerplexityLogits(LogitsProcessor):
         self.verbose = verbose
 
     def __call__(self, input_ids, scores):
-        # t0 = time.time()
+        #t0 = time.time()
         probs = torch.softmax(scores, dim=-1, dtype=torch.float)
         log_probs = torch.nan_to_num(torch.log(probs))  # Note: This is to convert log(0) nan to 0, but probs*log_probs makes this 0 not affect the perplexity.
         entropy = -torch.sum(probs * log_probs)
@@ -42,9 +47,8 @@ class PerplexityLogits(LogitsProcessor):
         if len(self.selected_probs) > 0:
             # Is the selected token in the top tokens?
             if self.verbose:
-                print('Probs: Token after', shared.tokenizer.decode(last_token_id))
-                print('Probs:', [shared.tokenizer.decode(token_id) for token_id in self.top_token_ids_list[-1][0]])
-                print('Probs:', [round(float(prob), 4) for prob in self.top_probs_list[-1][0]])
+                print(shared.tokenizer.decode(last_token_id), [shared.tokenizer.decode(token_id) for token_id in self.top_token_ids_list[-1][0]],
+                    [round(float(prob), 4) for prob in self.top_probs_list[-1][0]])
             if last_token_id in self.top_token_ids_list[-1][0]:
                 idx = self.top_token_ids_list[-1][0].index(last_token_id)
                 self.selected_probs.append(self.top_probs_list[-1][0][idx])
@@ -60,7 +64,7 @@ class PerplexityLogits(LogitsProcessor):
             pplbar = "-"
             if not np.isnan(perplexity):
                 pplbar = "*" * round(perplexity)
-            print(f"PPL: Token after {shared.tokenizer.decode(last_token_id)}\t{perplexity:.2f}\t{pplbar}")
+            print(f"PPL for token after {shared.tokenizer.decode(last_token_id)}: {perplexity:.2f} {pplbar}")
 
         # Get top 5 probabilities
         top_tokens_and_probs = torch.topk(probs, 5)
@@ -73,14 +77,15 @@ class PerplexityLogits(LogitsProcessor):
         probs = probs.cpu().numpy().flatten()
         self.last_probs = probs  # Need to keep this as a reference for top probs
 
-        # t1 = time.time()
-        # print(f"PPL Processor: {(t1-t0):.3f} s")
+        #t1 = time.time()
+        #print(f"PPL Processor: {(t1-t0):.3f} s")
         # About 1 ms, though occasionally up to around 100 ms, not sure why...
         # Doesn't actually modify the logits!
         return scores
 
 
 # Stores the perplexity and top probabilities
+# Module-level state: output_modifier declares `global ppl_logits_processor` before using it
 ppl_logits_processor = None
 
 
@@ -93,9 +98,9 @@ def logits_processor_modifier(logits_processor_list, input_ids):
 
 def output_modifier(text):
     global ppl_logits_processor
-    # t0 = time.time()
+    #t0 = time.time()
 
-    if not params['active']:
+    if not params['active'] or ppl_logits_processor is None:
         return text
 
     # TODO: It's probably more efficient to do this above rather than modifying all these lists
@@ -111,110 +116,147 @@ def output_modifier(text):
 
     end_part = '</div></div>' if params['probability_dropdown'] else '</span>'  # Helps with finding the index after replacing part of the text.
 
-    i = 0
-    for token, prob, ppl, top_tokens, top_probs in zip(gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list):
+    # Decoded text of the generated tokens (all but the last one), stripped; the initial space some tokenizers add is dealt with below.
+    # Used to find where the message started generating, for working with "continue" generations.
+    # Known limitation: this doesn't work for longer messages... Not sure how I should handle this.
+    full_msg = shared.tokenizer.decode([token_id for token_id in gen_token_ids[:-1]]).strip()
+    # Space at the beginning to account for tokenization spaces...
+    text = ' ' + html.unescape(text)
+    # There was an issue with tab lengths being off by one...
+    # Seems like it might be model-dependent...
+    #text = re.sub(r'( {3,})', r'\1 ', text)
+    # Start the search 2 characters before the match, to hopefully help with the tokenization spaces and "continue" issues.
+    # Caveat: the first replacement could then land on the end of the previous text if its last 2 chars match the token.
+    i = text.find(full_msg) - 2
+    if i < 0:
+        # Backup, try removing the extra whitespace (needed for continue)
+        i = text.find(full_msg.strip()) - 2
+        if i < 0:
+            i = 0
+
+    #i = 0
+    # Add token index for ability to regenerate from there
+    nonwhitespace_token_found = False
+    for index, token, prob, ppl, top_tokens, top_probs in zip(range(len(gen_tokens)), gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list):
+        # Skip whitespace-only tokens at the start of the generation (somehow this works without issues, but not sure how...)
+        if not nonwhitespace_token_found and token.strip() == '':
+            #print('Ignoring initial whitespace token...')
+            continue
+        nonwhitespace_token_found = True
+        max_prob = top_probs[0][0]
         color = 'ffffff'
         if params['color_by_probability'] and params['color_by_perplexity']:
-            color = probability_perplexity_color_scale(prob, ppl)
+            color = probability_perplexity_color_scale(prob, max_prob, ppl)
         elif params['color_by_perplexity']:
             color = perplexity_color_scale(ppl)
         elif params['color_by_probability']:
             color = probability_color_scale(prob)
-        if token in text[i:]:
+        if token.strip() in text[i:]:
             if params['probability_dropdown']:
-                text = text[:i] + text[i:].replace(token, add_dropdown_html(token, color, top_tokens, top_probs[0], ppl), 1)
+                text = text[:i] + text[i:].replace(token.replace('\n', ''), add_dropdown_html(token, index, color, top_tokens, top_probs[0], ppl), 1)
             else:
-                text = text[:i] + text[i:].replace(token, add_color_html(token, color), 1)
+                text = text[:i] + text[i:].replace(token.replace('\n', ''), add_color_html(token, color), 1)
+            
+            # Move past the HTML that was just inserted; searching for end_part every time might be slightly inefficient
             i += text[i:].find(end_part) + len(end_part)
+        else:
+            print('Missing token:', token, '...', text[i:i+20])
 
     # Use full perplexity list for calculating the average here.
-    print('Average perplexity:', round(np.mean(ppl_logits_processor.perplexities_list[:-1]), 4))
-    # t1 = time.time()
-    # print(f"Modifier: {(t1-t0):.3f} s")
+    # Only compute the mean when more than one value is stored, to avoid the "mean of empty slice" issue
+    if len(ppl_logits_processor.perplexities_list) > 1:
+        print('Average perplexity:', round(np.mean(ppl_logits_processor.perplexities_list[:-1]), 4))
+    #t1 = time.time()
+    #print(f"Output modifier: {(t1-t0):.3f} s")
     # About 50 ms
-    return text
+    return text.strip() # Remove extra beginning whitespace that some tokenizers add
 
 
 def probability_color_scale(prob):
     '''
     Green-yellow-red color scale
     '''
+    # hue (0.0 = red, 0.33 = green)
+    # saturation (0.0 = gray / white, 1.0 = normal, just leave at 1.0)
+    # brightness (0.0 = black, 1.0 = brightest, use something in between for better readability if you want...)
+    hue = prob * 0.33
+    rv, gv, bv = colorsys.hsv_to_rgb(hue, 1.0, 1.0)
+    # to hex
+    hex_col = f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
 
-    rv = 0
-    gv = 0
-    if prob <= 0.5:
-        rv = 'ff'
-        gv = hex(int(255 * prob * 2))[2:]
-        if len(gv) < 2:
-            gv = '0' * (2 - len(gv)) + gv
-    else:
-        rv = hex(int(255 - 255 * (prob - 0.5) * 2))[2:]
-        gv = 'ff'
-        if len(rv) < 2:
-            rv = '0' * (2 - len(rv)) + rv
-
-    return rv + gv + '00'
+    return hex_col
 
 
 def perplexity_color_scale(ppl):
     '''
     Red component only, white for 0 perplexity (sorry if you're not in dark mode)
     '''
-    value = hex(max(int(255.0 - params['ppl_scale'] * (float(ppl) - 1.0)), 0))[2:]
-    if len(value) < 2:
-        value = '0' * (2 - len(value)) + value
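+    # HSV color: hue is fixed at 0.0 (red) and brightness at 1.0
+    # saturation 0.0 = white, 1.0 = red
+    # perplexity is clipped to params['ppl_scale'] and divided by it to get the saturation,
+    # so the color scales from white to red the higher the perplexity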
 
-    return 'ff' + value + value
+    ppl = min(ppl, params['ppl_scale'])  # clip ppl to 0-params['ppl_scale'] for color scaling. 15 should be fine for clipping and scaling
+    sat = ppl / params['ppl_scale']
+    rv, gv, bv = colorsys.hsv_to_rgb(0.0, sat, 1.0)
+
+    # to hex
+    hex_col = f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
+    
+    return hex_col
 
 
-def probability_perplexity_color_scale(prob, ppl):
+def probability_perplexity_color_scale(prob, max_prob, ppl):
     '''
-    Green-yellow-red for probability and blue component for perplexity
+    Green-yellow-red for relative probability compared to maximum for the current token, and blue component for perplexity
     '''
-
-    rv = 0
-    gv = 0
-    bv = hex(min(max(int(params['ppl_scale'] * (float(ppl) - 1.0)), 0), 255))[2:]
-    if len(bv) < 2:
-        bv = '0' * (2 - len(bv)) + bv
-
-    if prob <= 0.5:
-        rv = 'ff'
-        gv = hex(int(255 * prob * 2))[2:]
-        if len(gv) < 2:
-            gv = '0' * (2 - len(gv)) + gv
-    else:
-        rv = hex(int(255 - 255 * (prob - 0.5) * 2))[2:]
-        gv = 'ff'
-        if len(rv) < 2:
-            rv = '0' * (2 - len(rv)) + rv
-
-    return rv + gv + bv
+    hue = prob/max_prob * 0.33
+    rv, gv, _ = colorsys.hsv_to_rgb(hue, 1.0, 1.0)
+    
+    ppl = min(ppl, params['ppl_scale'])  # clip ppl to 0-params['ppl_scale'] for color scaling. 15 should be fine for clipping and scaling
+    bv = ppl / params['ppl_scale']
+    
+    # to hex
+    hex_col = f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
+    
+    return hex_col
 
 
 def add_color_html(token, color):
-    return f'<span style="color: #{color}">{token}</span>'
+    output = ''
+    output += f'<span style="color: #{color}">{html.escape(repr(token)[1:-1])}</span>'
+    #if '\n' in token or '\r' in token: #token.isspace():
+    #    output += '<br>'
+    return output
 
 
-# TODO: Major issue: Applying this to too many tokens will cause a permanent slowdown in generation speed until the messages are removed from the history.
+# TODO: Might also need message index for the click-to-regenerate feature to work... For now it only works in the last message, which I think is fine.
+
+# TODO: Major issue: Applying this to too many tokens will cause a permanent slowdown in generation speed until the messages are removed from the history. The slowdown seems to be mostly resolved in the current version though
 # I think the issue is from HTML elements taking up space in the visible history, and things like history deepcopy add latency proportional to the size of the history.
 # Potential solution is maybe to modify the main generation code to send just the internal text and not the visible history, to avoid moving too much around.
 # I wonder if we can also avoid using deepcopy here.
-def add_dropdown_html(token, color, top_tokens, top_probs, perplexity=0):
-    html = f'<div class="hoverable"><span style="color: #{color}">{token}</span><div class="dropdown"><table class="dropdown-content"><tbody>'
-    for token_option, prob in zip(top_tokens, top_probs):
+def add_dropdown_html(token, index, color, top_tokens, top_probs, perplexity=0):
+    #print("Token:", token, token.isspace(), '\n' in token or '\r' in token)
+    output = ''
+    # Use the repr to get characters like \n visible. Exclude the quotes around it
+    output += f'<div class="hoverable" id="tok_{index}"><span style="color: #{color}">{html.escape(repr(token)[1:-1])}</span><div class="dropdown"><table class="dropdown-content"><tbody>'
+    for i, token_option, prob in zip(range(len(top_tokens)), top_tokens, top_probs):
         # TODO: Bold for selected token?
         # Using divs prevented the problem of divs inside spans causing issues.
         # Now the problem is that divs show the same whitespace of one space between every token.
         # There is probably some way to fix this in CSS that I don't know about.
         row_color = probability_color_scale(prob)
         row_class = ' class="selected"' if token_option == token else ''
-        html += f'<tr{row_class}><td style="color: #{row_color}">{token_option}</td><td style="color: #{row_color}">{prob:.4f}</td></tr>'
+        # This time we want to include the quotes around it so that we can see where the spaces are.
+        output += f'<tr{row_class}><td id="opt_{index}_{i}" style="color: #{row_color}">{html.escape(repr(token_option))}</td><td style="color: #{row_color}">{prob:.4f}</td></tr>'
     if perplexity != 0:
         ppl_color = perplexity_color_scale(perplexity)
-        html += f'<tr><td>Perplexity:</td><td style="color: #{ppl_color}">{perplexity:.4f}</td></tr>'
-    html += '</tbody></table></div></div>'
-    return html  # About 750 characters per token...
+        output += f'<tr><td>Perplexity:</td><td style="color: #{ppl_color}">{perplexity:.4f}</td></tr>'
+    output += '</tbody></table></div></div>'
+    #if '\n' in token or '\r' in token: #token.isspace():
+    #    output += '<br>' # I imagine this will cause problems sometimes
+    return output  # About 750 characters per token...
 
 
 def custom_css():
@@ -223,8 +265,8 @@ def custom_css():
             display: none;
             position: absolute;
             z-index: 50;
-            background-color: var(--block-background-fill);
-            box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
+            background-color: var(--background-fill-secondary);
+            box-shadow: 0px 8px 16px 0px rgba(0,0,0,1.0);
             width: max-content;
             overflow: visible;
             padding: 5px;
@@ -238,7 +280,7 @@ def custom_css():
         }
 
         .dropdown-content tr.selected {
-            background-color: var(--block-label-background-fill);
+            background-color: var(--background-fill-primary);
         }
 
         .dropdown-content td {
@@ -267,21 +309,106 @@ def custom_css():
         # TODO: This makes the hover menus extend outside the bounds of the chat area, which is good.
         # However, it also makes the scrollbar disappear, which is bad.
         # The scroll bar needs to still be present. So for now, we can't see dropdowns that extend past the edge of the chat area.
-        #.chat {
-        #    overflow-y: auto;
-        #}
+        .chat {
+            overflow-y: auto;
+        }
     """
 
+def custom_js():
+    return """
+
+function sleep(ms) {
+    return new Promise(resolve => setTimeout(resolve, ms));
+}    
+
+// Note that this will only work as intended on the last agent message
+document.addEventListener("click", async function(event) {
+    //console.log(event.target);
+    const id = event.target.id;
+    if (id.includes("opt_")) {
+        const id_parts = id.split("_");
+        const token_index = id_parts[1];
+        const option_index = id_parts[2];
+        // Exclude the quotes and convert newlines... Not sure about the newlines though
+        // TODO: Seems like continuing generation from a newline causes problems whether you add it or not!
+        const token_string = event.target.innerHTML.substring(1, event.target.innerHTML.length-1).replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '');
+        //console.log(token_index + ", " + option_index + ", " + token_string);
+        // Get all the previous text (I'm sure there is a more efficient way to do this)
+        var msg_text = ""
+        const msg_html = event.target.parentElement.parentElement.parentElement.parentElement.parentElement.parentElement;
+        var msg_parts = msg_html.childNodes;
+        for (var i = 0; i < msg_parts.length; i++) {
+            var msg_part = msg_parts[i];
+            if (msg_part.nodeType === Node.ELEMENT_NODE) {
+                if (msg_part.nodeName == "DIV") {
+                    var current_token_index = msg_part.id.split("_")[1];
+                    if (current_token_index == token_index) {
+                        // Use the replacement token
+                        // TODO: Don't have access to the tokenizer here, and sometimes there needs to be a space added before this token
+                        msg_text += token_string //.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '');
+                        break;
+                    }
+                    else {
+                        // Replace here or at the end?
+                        var text = msg_part.firstChild.innerHTML.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '')
+                        msg_text += text;
+                    }
+                }
+                else {
+                    // Break tag (hacky workaround because the newline literal can't be parsed here)
+                    //msg_text += String.fromCharCode(10);
+                    // Do nothing???
+                }
+            }
+            else if (msg_part.nodeType === Node.TEXT_NODE) {
+                msg_text +=  msg_part.textContent;
+            }
+        }
+        var textbox = document.querySelector("#chat-input textarea");
+        textbox.focus();
+        textbox.value = msg_text.trimStart() // Fix initial tokenization spaces
+        //console.log(textbox.value);
+        
+        // Add some delays to make sure it's processed correctly. Without these, there's a chance the events don't go through correctly and it doesn't work
+        // It's unknown how long this will take, and probably depends on the size of the message...
+        // It would be better to somehow wait for gradio to update instead of waiting a fixed amount of time.
+        // Hopefully 1 second of delay before starting generation isn't unacceptable.
+        var inputEvent = new Event('input', {
+            bubbles: true,
+            cancelable: true,
+        });
+        textbox.dispatchEvent(inputEvent);
+        var changeEvent = new Event('change', {
+            bubbles: true,
+            cancelable: true,
+        });
+        textbox.dispatchEvent(changeEvent);
+        await sleep(250);
+        document.getElementById("Replace-last").click();
+        // This can take a while to execute
+        await sleep(750);
+        document.getElementById("Continue").click();
+    }
+});
+
+console.log("Custom JS for perplexity_colors loaded");
+"""
 
 # Monkeypatch applied to html_generator.py
 # We simply don't render markdown into HTML. We wrap everything in <pre> tags to preserve whitespace
 # formatting. If you're coloring tokens by perplexity or probability, or especially if you're using
 # the probability dropdown, you probably care more about seeing the tokens the model actually outputted
 # rather than rendering ```code blocks``` or *italics*.
+@functools.lru_cache(maxsize=4096)
 def convert_to_markdown(string):
     return '<pre>' + string + '</pre>'
 
+def convert_to_markdown_wrapped(string, use_cache=True):
+    if use_cache:
+        return convert_to_markdown(string)
+    return convert_to_markdown.__wrapped__(string)
 
+# Installing the monkeypatch is still necessary for formatting to work correctly
 html_generator.convert_to_markdown = convert_to_markdown
 
 
@@ -298,7 +425,7 @@ def ui():
     def update_prob_dropdown_check(x):
         params.update({'probability_dropdown': x})
 
-    active_check = gradio.Checkbox(value=True, label="Compute probabilities and perplexity scores", info="Activate this extension. Note that this extension currently does not work with exllama or llama.cpp.")
+    active_check = gradio.Checkbox(value=True, label="Compute probabilities and perplexity scores", info="Activate this extension. Note that this extension currently does not work with llama.cpp, but it does work with ExLlamav2_HF and llamacpp_HF when set up correctly")
     color_by_ppl_check = gradio.Checkbox(value=False, label="Color by perplexity", info="Higher perplexity is more red. If also showing probability, higher perplexity has more blue component.")
     color_by_prob_check = gradio.Checkbox(value=False, label="Color by probability", info="Green-yellow-red linear scale, with 100% green, 50% yellow, 0% red.")
     prob_dropdown_check = gradio.Checkbox(value=False, label="Probability dropdown", info="Hover over a token to show a dropdown of top token probabilities. Currently slightly buggy with whitespace between tokens.")

From 16fa9215c4cfb0d280ca470db30bda1743afc068 Mon Sep 17 00:00:00 2001
From: Kelvie Wong <kelvie@kelvie.ca>
Date: Tue, 18 Feb 2025 07:01:30 -0800
Subject: [PATCH 06/16] Fix OpenAI API with new param (show_after), closes
 #6747 (#6749)

---------

Co-authored-by: oobabooga <oobabooga4@gmail.com>
---
 modules/chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index 2852aaf3..66ab8c74 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -412,7 +412,7 @@ def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_
             yield history
             return
 
-    show_after = html.escape(state["show_after"]) if state["show_after"] else None
+    show_after = html.escape(state.get("show_after")) if state.get("show_after") else None
     for history in chatbot_wrapper(text, state, regenerate=regenerate, _continue=_continue, loading_message=loading_message, for_ui=for_ui):
         if show_after:
             after = history["visible"][-1][1].partition(show_after)[2] or "*Is thinking...*"

From a12e05d9c0818d5f4a709ddcbf4cd973e4aa7f05 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 12 Mar 2025 16:11:03 -0300
Subject: [PATCH 07/16] Bump jinja2 from 3.1.5 to 3.1.6 (#6786)

---
 requirements.txt                 | 2 +-
 requirements_amd.txt             | 2 +-
 requirements_amd_noavx2.txt      | 2 +-
 requirements_apple_intel.txt     | 2 +-
 requirements_apple_silicon.txt   | 2 +-
 requirements_cpu_only.txt        | 2 +-
 requirements_cpu_only_noavx2.txt | 2 +-
 requirements_noavx2.txt          | 2 +-
 requirements_nowheels.txt        | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index d09f6bf5..a4812ac5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 124ad6b6..972979bc 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -4,7 +4,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 9e3063c3..d40db513 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -4,7 +4,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 0ef8db34..e82c198a 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -4,7 +4,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 9c4bcc11..e23f465e 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -4,7 +4,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index 9f19238a..fc5c52eb 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -4,7 +4,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 042dbbd8..1201ab05 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -4,7 +4,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index f488cafc..38df6e69 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -5,7 +5,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index 1bc2e385..858ffff5 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -4,7 +4,7 @@ datasets
 einops
 fastapi==0.112.4
 gradio==4.37.*
-jinja2==3.1.5
+jinja2==3.1.6
 markdown
 numba==0.59.*
 numpy==1.26.*

From 39fded487af6c80bcbf353efcd7562494e7c9cb2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 12 Mar 2025 17:54:30 -0700
Subject: [PATCH 08/16] Bump ExllamaV2 to 0.2.8

---
 requirements.txt               | 10 +++++-----
 requirements_amd.txt           |  6 +++---
 requirements_amd_noavx2.txt    |  6 +++---
 requirements_apple_intel.txt   |  2 +-
 requirements_apple_silicon.txt |  2 +-
 requirements_noavx2.txt        | 10 +++++-----
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a4812ac5..77de5853 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -50,11 +50,11 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 972979bc..761805e1 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -39,6 +39,6 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 # AMD wheels
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.7+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.7+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index d40db513..799044b3 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -37,6 +37,6 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index e82c198a..435201ff 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -35,4 +35,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index e23f465e..34e8a3b6 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -37,4 +37,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 38df6e69..cd41d69f 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -50,11 +50,11 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te
 https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
 https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

From 0261338910ba4a0f0f7d117d5152ae1fa938b70c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 12 Mar 2025 17:55:25 -0700
Subject: [PATCH 09/16] Bump llama-cpp-python to 0.3.8

---
 requirements.txt                 | 24 ++++++++++++------------
 requirements_amd.txt             | 12 ++++++------
 requirements_amd_noavx2.txt      |  8 ++++----
 requirements_apple_intel.txt     |  8 ++++----
 requirements_apple_silicon.txt   | 12 ++++++------
 requirements_cpu_only.txt        |  8 ++++----
 requirements_cpu_only_noavx2.txt |  8 ++++----
 requirements_noavx2.txt          | 24 ++++++++++++------------
 8 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 77de5853..83bd3a53 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -32,22 +32,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, with GGML_CUDA_FORCE_MMQ)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, without GGML_CUDA_FORCE_MMQ)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index 761805e1..1e757ffe 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -31,14 +31,14 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.7+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.7+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.8+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.8+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index 799044b3..f74ebf69 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -31,10 +31,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # AMD wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index 435201ff..dcdeae3f 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -31,8 +31,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 34e8a3b6..b823e40e 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -31,10 +31,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index fc5c52eb..fe3f522a 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -31,7 +31,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 1201ab05..014e2e5d 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -31,7 +31,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index cd41d69f..6139c46e 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -32,22 +32,22 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, with GGML_CUDA_FORCE_MMQ)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # llama-cpp-python (CUDA, without GGML_CUDA_FORCE_MMQ)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 
 # CUDA wheels
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

From f04a37adc29304cd8216ea38d0af2ea6722947ab Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 14 Mar 2025 05:20:15 -0700
Subject: [PATCH 10/16] UI: improved scrollbar styles

---
 css/main.css | 54 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/css/main.css b/css/main.css
index 23492338..b4544693 100644
--- a/css/main.css
+++ b/css/main.css
@@ -248,40 +248,58 @@ button {
     font-size: 100% !important;
 }
 
+.pretty_scrollbar {
+    scrollbar-width: thin;
+    scrollbar-color: rgb(140 140 140 / 35%) transparent;
+}
+
+.dark .pretty_scrollbar {
+    scrollbar-width: thin;
+    scrollbar-color: rgb(204 204 204 / 35%) transparent;
+}
+
 .pretty_scrollbar::-webkit-scrollbar {
-    width: 7px;
-    height: 7px;
+    width: 8px;
+    height: 8px;
 }
 
 .pretty_scrollbar::-webkit-scrollbar-track {
     background: transparent;
 }
 
-.pretty_scrollbar::-webkit-scrollbar-thumb,
+.pretty_scrollbar::-webkit-scrollbar-thumb {
+    background: rgb(140 140 140 / 35%);
+    border-radius: 4px;
+    transition: background 0.2s ease;
+}
+
 .pretty_scrollbar::-webkit-scrollbar-thumb:hover {
-    background: var(--neutral-300);
-    border-radius: 30px;
+    background: rgb(140 140 140 / 55%);
+}
+
+.dark .pretty_scrollbar::-webkit-scrollbar-thumb {
+    background: rgb(204 204 204 / 35%);
+    border-radius: 4px;
 }
 
-.dark .pretty_scrollbar::-webkit-scrollbar-thumb,
 .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover {
-    background: #ccc;
-    border-radius: 10px;
-}
-
-.pretty_scrollbar::-webkit-resizer {
-    background: #c5c5d2;
-}
-
-.dark .pretty_scrollbar::-webkit-resizer {
-    background: #ccc;
-    border-radius: 10px;
+    background: rgb(204 204 204 / 55%);
 }
 
 .pretty_scrollbar::-webkit-scrollbar-corner {
     background: transparent;
 }
 
+.pretty_scrollbar::-webkit-resizer {
+    background: rgb(140 140 140 / 35%);
+    border-radius: 4px;
+}
+
+.dark .pretty_scrollbar::-webkit-resizer {
+    background: rgb(204 204 204 / 35%);
+    border-radius: 4px;
+}
+
 audio {
     max-width: 100%;
 }
@@ -295,7 +313,7 @@ audio {
     width: 0;
     text-align: left;
     direction: rtl;
-    right: 5px;
+    right: 13px;
 }
 
 #default-token-counter {

From 26317a4c7e371c59e03be4ac39c62505c43f7aad Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 14 Mar 2025 10:59:05 -0700
Subject: [PATCH 11/16] Fix jinja2 error while loading c4ai-command-a-03-2025

---
 modules/chat.py                    |  7 +++++-
 modules/llama_cpp_python_hijack.py | 40 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index 66ab8c74..fd949907 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -11,6 +11,7 @@ from pathlib import Path
 
 import gradio as gr
 import yaml
+from jinja2.ext import loopcontrols
 from jinja2.sandbox import ImmutableSandboxedEnvironment
 from PIL import Image
 
@@ -35,7 +36,11 @@ def strftime_now(format):
     return datetime.now().strftime(format)
 
 
-jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
+jinja_env = ImmutableSandboxedEnvironment(
+    trim_blocks=True,
+    lstrip_blocks=True,
+    extensions=[loopcontrols]
+)
 jinja_env.globals["strftime_now"] = strftime_now
 
 
diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py
index f3872a74..c03c28a7 100644
--- a/modules/llama_cpp_python_hijack.py
+++ b/modules/llama_cpp_python_hijack.py
@@ -121,5 +121,45 @@ def monkey_patch_llama_cpp_python(lib):
     lib.Llama.original_generate = lib.Llama.generate
     lib.Llama.generate = my_generate
 
+    # Also patch Jinja2ChatFormatter to handle loop controls
+    if hasattr(lib, 'llama_chat_format') and hasattr(lib.llama_chat_format, 'Jinja2ChatFormatter'):
+        Formatter = lib.llama_chat_format.Jinja2ChatFormatter
+
+        if not getattr(Formatter, '_is_patched', False):
+            def patched_init(self, *args, **kwargs):
+                # Extract parameters from args or kwargs
+                if args:
+                    self.template = args[0]
+                    self.eos_token = args[1] if len(args) > 1 else kwargs.get('eos_token')
+                    self.bos_token = args[2] if len(args) > 2 else kwargs.get('bos_token')
+                    self.add_generation_prompt = args[3] if len(args) > 3 else kwargs.get('add_generation_prompt', True)
+                    self.stop_token_ids = args[4] if len(args) > 4 else kwargs.get('stop_token_ids')
+                else:
+                    self.template = kwargs.get('template')
+                    self.eos_token = kwargs.get('eos_token')
+                    self.bos_token = kwargs.get('bos_token')
+                    self.add_generation_prompt = kwargs.get('add_generation_prompt', True)
+                    self.stop_token_ids = kwargs.get('stop_token_ids')
+
+                # Process stop tokens as in the original
+                self.stop_token_ids = (
+                    set(self.stop_token_ids) if self.stop_token_ids is not None else None
+                )
+
+                # Create environment with loopcontrols extension
+                from jinja2.ext import loopcontrols
+                import jinja2
+
+                self._environment = jinja2.sandbox.ImmutableSandboxedEnvironment(
+                    loader=jinja2.BaseLoader(),
+                    trim_blocks=True,
+                    lstrip_blocks=True,
+                    extensions=[loopcontrols]
+                ).from_string(self.template)
+
+            # Replace the original __init__ with our patched version
+            Formatter.__init__ = patched_init
+            Formatter._is_patched = True
+
     # Set the flag to indicate that the patch has been applied
     lib.Llama._is_patched = True

From 6ab04698f6f74ac127156807698087bcb68dac3f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 14 Mar 2025 12:03:49 -0700
Subject: [PATCH 12/16] UI: improve the light mode left sidebar color

---
 css/main.css | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/css/main.css b/css/main.css
index b4544693..8082e3cc 100644
--- a/css/main.css
+++ b/css/main.css
@@ -249,7 +249,7 @@ button {
 }
 
 .pretty_scrollbar {
-    scrollbar-width: thin;
+    scrollbar-width: thin !important;
     scrollbar-color: rgb(140 140 140 / 35%) transparent;
 }
 
@@ -1181,7 +1181,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }
 
 .header_bar button.selected {
-    background: white;
+    background: #E0E0E0;
 }
 
 #chat-controls,

From 677d74a6a0e6bec0417caa6dde002af00b194328 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 14 Mar 2025 12:09:56 -0700
Subject: [PATCH 13/16] Revert "UI: improved scrollbar styles", add just a
 small change instead

---
 css/main.css | 48 +++++++++++++++---------------------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/css/main.css b/css/main.css
index 8082e3cc..a3fa9753 100644
--- a/css/main.css
+++ b/css/main.css
@@ -248,16 +248,6 @@ button {
     font-size: 100% !important;
 }
 
-.pretty_scrollbar {
-    scrollbar-width: thin !important;
-    scrollbar-color: rgb(140 140 140 / 35%) transparent;
-}
-
-.dark .pretty_scrollbar {
-    scrollbar-width: thin;
-    scrollbar-color: rgb(204 204 204 / 35%) transparent;
-}
-
 .pretty_scrollbar::-webkit-scrollbar {
     width: 8px;
     height: 8px;
@@ -267,39 +257,31 @@ button {
     background: transparent;
 }
 
-.pretty_scrollbar::-webkit-scrollbar-thumb {
-    background: rgb(140 140 140 / 35%);
-    border-radius: 4px;
-    transition: background 0.2s ease;
-}
-
+.pretty_scrollbar::-webkit-scrollbar-thumb,
 .pretty_scrollbar::-webkit-scrollbar-thumb:hover {
-    background: rgb(140 140 140 / 55%);
-}
-
-.dark .pretty_scrollbar::-webkit-scrollbar-thumb {
-    background: rgb(204 204 204 / 35%);
-    border-radius: 4px;
+    background: var(--neutral-300);
+    border-radius: 30px;
 }
 
+.dark .pretty_scrollbar::-webkit-scrollbar-thumb,
 .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover {
-    background: rgb(204 204 204 / 55%);
+    background: #ccc;
+    border-radius: 10px;
+}
+
+.pretty_scrollbar::-webkit-resizer {
+    background: #c5c5d2;
+}
+
+.dark .pretty_scrollbar::-webkit-resizer {
+    background: #ccc;
+    border-radius: 10px;
 }
 
 .pretty_scrollbar::-webkit-scrollbar-corner {
     background: transparent;
 }
 
-.pretty_scrollbar::-webkit-resizer {
-    background: rgb(140 140 140 / 35%);
-    border-radius: 4px;
-}
-
-.dark .pretty_scrollbar::-webkit-resizer {
-    background: rgb(204 204 204 / 35%);
-    border-radius: 4px;
-}
-
 audio {
     max-width: 100%;
 }

From 5bcd2d7ad01a211198df1a133a29cfb682336e0b Mon Sep 17 00:00:00 2001
From: oobabooga <oobabooga4@gmail.com>
Date: Fri, 14 Mar 2025 16:45:11 -0300
Subject: [PATCH 14/16] Add the top N-sigma sampler (#6796)

---
 extensions/openai/typing.py |  1 +
 modules/loaders.py          |  3 +++
 modules/presets.py          |  3 ++-
 modules/sampler_hijack.py   | 53 +++++++++++++++++++++++++++++++++++--
 modules/text_generation.py  |  1 +
 modules/ui.py               |  1 +
 modules/ui_parameters.py    |  1 +
 7 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 5f0e0128..ea688897 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -21,6 +21,7 @@ class GenerationOptions(BaseModel):
     eta_cutoff: float = 0
     tfs: float = 1
     top_a: float = 0
+    top_n_sigma: float = 0
     dry_multiplier: float = 0
     dry_allowed_length: int = 2
     dry_base: float = 1.75
diff --git a/modules/loaders.py b/modules/loaders.py
index cd864e40..88ded1d1 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -137,6 +137,7 @@ def transformers_samplers():
         'eta_cutoff',
         'tfs',
         'top_a',
+        'top_n_sigma',
         'dry_multiplier',
         'dry_allowed_length',
         'dry_base',
@@ -224,6 +225,7 @@ loaders_samplers = {
         'eta_cutoff',
         'tfs',
         'top_a',
+        'top_n_sigma',
         'dry_multiplier',
         'dry_allowed_length',
         'dry_base',
@@ -288,6 +290,7 @@ loaders_samplers = {
         'eta_cutoff',
         'tfs',
         'top_a',
+        'top_n_sigma',
         'dry_multiplier',
         'dry_allowed_length',
         'dry_base',
diff --git a/modules/presets.py b/modules/presets.py
index b841af53..7cab2af0 100644
--- a/modules/presets.py
+++ b/modules/presets.py
@@ -28,6 +28,7 @@ def default_preset():
         'eta_cutoff': 0,
         'tfs': 1,
         'top_a': 0,
+        'top_n_sigma': 0,
         'dry_multiplier': 0,
         'dry_allowed_length': 2,
         'dry_base': 1.75,
@@ -45,7 +46,7 @@ def default_preset():
         'do_sample': True,
         'dynamic_temperature': False,
         'temperature_last': False,
-        'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
+        'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
         'dry_sequence_breakers': '"\\n", ":", "\\"", "*"',
     }
 
diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
index e0df49c3..e6883289 100644
--- a/modules/sampler_hijack.py
+++ b/modules/sampler_hijack.py
@@ -5,7 +5,6 @@ import random
 
 import torch
 import transformers
-from transformers import LogitsProcessor
 from transformers.generation.logits_process import (
     LogitNormalization,
     LogitsProcessor,
@@ -193,6 +192,46 @@ class TopALogitsWarper(LogitsProcessor):
         return scores
 
 
+class TopNSigmaLogitsWarper(LogitsProcessor):
+    def __init__(self, n_sigma: float = 2.0, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        """
+        Initialize Top-nσ Sampling logits warper.
+
+        Args:
+            n_sigma: The threshold multiplier for standard deviation
+            filter_value: Value to assign to filtered logits
+            min_tokens_to_keep: Minimum number of tokens to keep
+        """
+        if n_sigma < 0:
+            raise ValueError(f"`n_sigma` must be a non-negative float, but is {n_sigma}")
+        self.n_sigma = n_sigma
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        """Return `scores` with every logit below `max - n_sigma * std` replaced by `filter_value`."""
+        max_logit = torch.max(scores, dim=-1, keepdim=True)[0]
+
+        # Entries that are already non-finite (e.g. -inf from an earlier warper) must not skew the
+        # spread, so the unbiased standard deviation is computed by hand over the finite logits only.
+        finite_mask = torch.isfinite(scores)
+        finite_count = finite_mask.sum(dim=-1, keepdim=True)
+        finite_scores = scores.float().masked_fill(~finite_mask, 0.0)
+        mean_logit = finite_scores.sum(dim=-1, keepdim=True) / finite_count.clamp(min=1)
+        squared_dev = ((finite_scores - mean_logit) ** 2).masked_fill(~finite_mask, 0.0)
+        std_logit = (squared_dev.sum(dim=-1, keepdim=True) / (finite_count - 1).clamp(min=1)).sqrt()
+
+        # Tokens with logits >= max_logit - n_sigma * std_logit are kept
+        indices_to_remove = scores < max_logit - self.n_sigma * std_logit
+
+        if self.min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep tokens
+            top_k_indices = torch.topk(scores, self.min_tokens_to_keep, dim=-1)[1]
+            indices_to_remove.scatter_(-1, top_k_indices, False)
+
+        return scores.masked_fill(indices_to_remove, self.filter_value)
+
+
 # Exclude Top Choices (XTC)
 class XTCLogitsWarper(LogitsProcessor):
     def __init__(self, threshold: float, probability: float, filter_value: float = -float("Inf")):
@@ -525,6 +564,14 @@ def get_logits_processor_patch(self, **kwargs):
             )
         )
 
+    if generation_config.top_n_sigma is not None and generation_config.top_n_sigma > 0.0:
+        warpers_to_add.append(
+            TopNSigmaLogitsWarper(
+                n_sigma=generation_config.top_n_sigma,
+                min_tokens_to_keep=min_tokens_to_keep
+            )
+        )
+
     if generation_config.xtc_probability is not None and generation_config.xtc_probability > 0:
         warpers_to_add.append(
             XTCLogitsWarper(
@@ -589,6 +636,7 @@ def get_logits_processor_patch(self, **kwargs):
         'TailFreeLogitsWarper': 'tfs',
         'TemperatureLogitsWarperCustom': 'temperature',
         'TopALogitsWarper': 'top_a',
+        'TopNSigmaLogitsWarper': 'top_n_sigma',
         'TopKLogitsWarper': 'top_k',
         'TopPLogitsWarper': 'top_p',
         'TypicalLogitsWarper': 'typical_p',
@@ -636,6 +684,7 @@ def generation_config_init_patch(self, **kwargs):
     self.smoothing_curve = kwargs.pop("smoothing_curve", 1.0)
     self.tfs = kwargs.pop("tfs", 1.0)
     self.top_a = kwargs.pop("top_a", 0.0)
+    self.top_n_sigma = kwargs.pop("top_n_sigma", 0.0)
     self.mirostat_mode = kwargs.pop("mirostat_mode", 0)
     self.mirostat_eta = kwargs.pop("mirostat_eta", 0.1)
     self.mirostat_tau = kwargs.pop("mirostat_tau", 5)
@@ -649,7 +698,7 @@ def generation_config_init_patch(self, **kwargs):
     self.xtc_threshold = kwargs.pop("xtc_threshold", 0.1)
     self.xtc_probability = kwargs.pop("xtc_probability", 0)
     self.temperature_last = kwargs.pop("temperature_last", False)
-    self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'dry', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'xtc', 'encoder_repetition_penalty', 'no_repeat_ngram'])
+    self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'dry', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_n_sigma', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'xtc', 'encoder_repetition_penalty', 'no_repeat_ngram'])
 
 
 def hijack_samplers():
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 152b2b8d..eff6495e 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -302,6 +302,7 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
         'xtc_probability',
         'tfs',
         'top_a',
+        'top_n_sigma',
         'dry_multiplier',
         'dry_allowed_length',
         'dry_base',
diff --git a/modules/ui.py b/modules/ui.py
index b776e19c..adbb67b0 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -183,6 +183,7 @@ def list_interface_input_elements():
         'eta_cutoff',
         'tfs',
         'top_a',
+        'top_n_sigma',
         'dry_multiplier',
         'dry_allowed_length',
         'dry_base',
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index 265840ed..846fcfe7 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -37,6 +37,7 @@ def create_ui(default_preset):
 
                             gr.Markdown('## Curve cutoff')
                             shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
+                            shared.gradio['top_n_sigma'] = gr.Slider(0.0, 5.0, value=generate_params['top_n_sigma'], step=0.01, label='top_n_sigma')
                             shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p')
                             shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k')
                             shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p')

From 60d67994d9703c44ff1f56fd3aa8a51d0e7658ad Mon Sep 17 00:00:00 2001
From: SeanScripts <64337075+SeanScripts@users.noreply.github.com>
Date: Fri, 14 Mar 2025 12:45:53 -0700
Subject: [PATCH 15/16] Perplexity colors extension updates (#6764)

---
 extensions/perplexity_colors/script.py | 80 ++++++++++++++++++--------
 1 file changed, 55 insertions(+), 25 deletions(-)

diff --git a/extensions/perplexity_colors/script.py b/extensions/perplexity_colors/script.py
index 849e4e63..d032cebd 100644
--- a/extensions/perplexity_colors/script.py
+++ b/extensions/perplexity_colors/script.py
@@ -96,23 +96,42 @@ def logits_processor_modifier(logits_processor_list, input_ids):
         logits_processor_list.append(ppl_logits_processor)
 
 
+def get_last_token(text, tokens_list, token_ids_list, token_probs_list):
+    """Return the first (token, token_id, prob) candidate that `text` ends with, else ('', -1, 0.0). Blank candidates must match the unstripped text."""
+    for token, token_id, prob in zip(tokens_list, token_ids_list, token_probs_list):
+        if (token.strip() and text.strip().endswith(token.strip())) or (token and not token.strip() and text.endswith(token)):  # '' is a suffix of anything
+            return token, token_id, prob
+    print("Last token not found in list:", tokens_list)
+    return '', -1, 0.0
+
+
 def output_modifier(text):
     global ppl_logits_processor
     #t0 = time.time()
+    original_text = text
 
     if not params['active'] or ppl_logits_processor is None:
         return text
 
+    # Space at the beginning to account for tokenization spaces...
+    text = ' ' + html.unescape(text)
+
     # TODO: It's probably more efficient to do this above rather than modifying all these lists
     # Remove last element of perplexities_list, top_token_ids_list, top_tokens_list, top_probs_list since everything is off by one because this extension runs before generation
-    perplexities = ppl_logits_processor.perplexities_list[:-1]
-    top_token_ids_list = ppl_logits_processor.top_token_ids_list[:-1]
+    perplexities = ppl_logits_processor.perplexities_list
+    top_token_ids_list = ppl_logits_processor.top_token_ids_list
     top_tokens_list = [[shared.tokenizer.decode(token_id) for token_id in top_token_ids[0]] for top_token_ids in top_token_ids_list]
-    top_probs_list = ppl_logits_processor.top_probs_list[:-1]
+    top_probs_list = ppl_logits_processor.top_probs_list
     # Remove first element of generated_token_ids, generated_tokens, selected_probs because they are for the last token of the prompt
     gen_token_ids = ppl_logits_processor.generated_token_ids[1:]
+    # Add last sampled token, if possible (it could be past the end of the top 5 list)
+    last_token, last_token_id, last_prob = get_last_token(text, top_tokens_list[-1], top_token_ids_list[-1][0], top_probs_list[-1][0])
+    if last_token_id != -1:
+        gen_token_ids.append(last_token_id)
     gen_tokens = [shared.tokenizer.decode(token_id) for token_id in gen_token_ids]
     sel_probs = ppl_logits_processor.selected_probs[1:]
+    if last_token_id != -1:
+        sel_probs.append(last_prob)
 
     end_part = '</div></div>' if params['probability_dropdown'] else '</span>'  # Helps with finding the index after replacing part of the text.
 
@@ -120,8 +139,7 @@ def output_modifier(text):
     # Used to find where the message started generating, for working with "continue" generations
     # Doesn't work for longer messages... Not sure how I should handle this
     full_msg = shared.tokenizer.decode([token_id for token_id in gen_token_ids[:-1]]).strip()
-    # Space at the beginning to account for tokenization spaces...
-    text = ' ' + html.unescape(text)
+    
     # There was an issue with tab lengths being off by one...
     # Seems like it might be model-dependent...
     #text = re.sub(r'( {3,})', r'\1 ', text)
@@ -137,6 +155,7 @@ def output_modifier(text):
     #i = 0
     # Add token index for ability to regenerate from there
     nonwhitespace_token_found = False
+    missing_token_count = 0
     for index, token, prob, ppl, top_tokens, top_probs in zip(range(len(gen_tokens)), gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list):
         # Somehow this works without issues, but not sure how...
         if not nonwhitespace_token_found and token.strip() == '':
@@ -153,14 +172,20 @@ def output_modifier(text):
             color = probability_color_scale(prob)
         if token.strip() in text[i:]:
             if params['probability_dropdown']:
-                text = text[:i] + text[i:].replace(token.replace('\n', ''), add_dropdown_html(token, index, color, top_tokens, top_probs[0], ppl), 1)
+                text = text[:i] + text[i:].replace(token.replace('\n', ''), add_dropdown_html(token, index, i, color, top_tokens, top_probs[0], ppl), 1)
             else:
                 text = text[:i] + text[i:].replace(token.replace('\n', ''), add_color_html(token, color), 1)
             
             # This might be slightly inefficient
             i += text[i:].find(end_part) + len(end_part)
         else:
+            missing_token_count += 1
             print('Missing token:', token, '...', text[i:i+20])
+            # If there are any missing tokens, then either the tokenization was off, or this is the start of a conversation, or something else went wrong
+        if missing_token_count > 5:
+            print("Canceling token coloring...")
+            return original_text
+
 
     # Use full perplexity list for calculating the average here.
     # Fix issue with mean of empty slice
@@ -236,11 +261,11 @@ def add_color_html(token, color):
 # I think the issue is from HTML elements taking up space in the visible history, and things like history deepcopy add latency proportional to the size of the history.
 # Potential solution is maybe to modify the main generation code to send just the internal text and not the visible history, to avoid moving too much around.
 # I wonder if we can also avoid using deepcopy here.
-def add_dropdown_html(token, index, color, top_tokens, top_probs, perplexity=0):
+def add_dropdown_html(token, index, msg_position, color, top_tokens, top_probs, perplexity=0):
     #print("Token:", token, token.isspace(), '\n' in token or '\r' in token)
     output = ''
     # Use the repr to get characters like \n visible. Exclude the quotes around it
-    output += f'<div class="hoverable" id="tok_{index}"><span style="color: #{color}">{html.escape(repr(token)[1:-1])}</span><div class="dropdown"><table class="dropdown-content"><tbody>'
+    output += f'<div class="hoverable" name="tok_{index}_{msg_position}"><span style="color: #{color}">{html.escape(repr(token)[1:-1])}</span><div class="dropdown"><table class="dropdown-content"><tbody>'
     for i, token_option, prob in zip(range(len(top_tokens)), top_tokens, top_probs):
         # TODO: Bold for selected token?
         # Using divs prevented the problem of divs inside spans causing issues.
@@ -249,7 +274,7 @@ def add_dropdown_html(token, index, color, top_tokens, top_probs, perplexity=0):
         row_color = probability_color_scale(prob)
         row_class = ' class="selected"' if token_option == token else ''
         # This time we want to include the quotes around it so that we can see where the spaces are.
-        output += f'<tr{row_class}><td id="opt_{index}_{i}" style="color: #{row_color}">{html.escape(repr(token_option))}</td><td style="color: #{row_color}">{prob:.4f}</td></tr>'
+        output += f'<tr{row_class}><td name="opt_{index}_{i}_{msg_position}" style="color: #{row_color}">{html.escape(repr(token_option))}</td><td style="color: #{row_color}">{prob:.4f}</td></tr>'
     if perplexity != 0:
         ppl_color = perplexity_color_scale(perplexity)
         output += f'<tr><td>Perplexity:</td><td style="color: #{ppl_color}">{perplexity:.4f}</td></tr>'
@@ -324,11 +349,12 @@ function sleep(ms) {
 // Note that this will only work as intended on the last agent message
 document.addEventListener("click", async function(event) {
     //console.log(event.target);
-    const id = event.target.id;
-    if (id.includes("opt_")) {
-        const id_parts = id.split("_");
-        const token_index = id_parts[1];
-        const option_index = id_parts[2];
+    const name = event.target.getAttribute("name");
+    if (name != null && name.includes("opt_")) {
+        const name_parts = name.split("_");
+        const token_index = name_parts[1];
+        const option_index = name_parts[2];
+        const msg_pos = name_parts[3];
         // Exclude the quotes and convert newlines... Not sure about the newlines though
         // TODO: Seems like continuing generation from a newline causes problems whether you add it or not!
         const token_string = event.target.innerHTML.substring(1, event.target.innerHTML.length-1).replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '');
@@ -341,17 +367,21 @@ document.addEventListener("click", async function(event) {
             var msg_part = msg_parts[i];
             if (msg_part.nodeType === Node.ELEMENT_NODE) {
                 if (msg_part.nodeName == "DIV") {
-                    var current_token_index = msg_part.id.split("_")[1];
-                    if (current_token_index == token_index) {
-                        // Use the replacement token
-                        // TODO: Don't have access to the tokenizer here, and sometimes there needs to be a space added before this token
-                        msg_text += token_string //.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '');
-                        break;
-                    }
-                    else {
-                        // Replace here or at the end?
-                        var text = msg_part.firstChild.innerHTML.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '')
-                        msg_text += text;
+                    var msg_part_name = msg_part.getAttribute("name");
+                    if (msg_part_name != null) {
+                        var current_token_index = msg_part_name.split("_")[1];
+                        var current_message_pos = msg_part_name.split("_")[2];
+                        if (current_token_index == token_index && current_message_pos == msg_pos) {
+                            // Use the replacement token
+                            // TODO: Don't have access to the tokenizer here, and sometimes there needs to be a space added before this token
+                            msg_text += token_string //.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '');
+                            break;
+                        }
+                        else {
+                            // Replace here or at the end?
+                            var text = msg_part.firstChild.innerHTML.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '')
+                            msg_text += text;
+                        }
                     }
                 }
                 else {

From 758c3f15a5a9c24a69ee3104fe06d5f5a22b3cd3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 14 Mar 2025 20:04:43 -0700
Subject: [PATCH 16/16] Lint

---
 modules/llama_cpp_python_hijack.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py
index c03c28a7..8572cd81 100644
--- a/modules/llama_cpp_python_hijack.py
+++ b/modules/llama_cpp_python_hijack.py
@@ -147,8 +147,8 @@ def monkey_patch_llama_cpp_python(lib):
                 )
 
                 # Create environment with loopcontrols extension
-                from jinja2.ext import loopcontrols
                 import jinja2
+                from jinja2.ext import loopcontrols
 
                 self._environment = jinja2.sandbox.ImmutableSandboxedEnvironment(
                     loader=jinja2.BaseLoader(),