From afadc787d71e8f0e9c37a0afbdf804f9864d66ec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 10 Mar 2024 20:09:34 -0700
Subject: [PATCH 01/12] Optimize the UI by caching convert_to_markdown calls

---
 modules/html_generator.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index e3dd453e..2b125c2c 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -2,6 +2,7 @@ import html
 import os
 import re
 import time
+import functools
 from pathlib import Path
 
 import markdown
@@ -47,6 +48,7 @@ def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
+@functools.lru_cache(maxsize=512)
 def convert_to_markdown(string):
 
     # Blockquote

From 9eca19740955da3adb1a0bcbc3fd7b01fbaac5e8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Mar 2024 16:31:13 -0700
Subject: [PATCH 02/12] Minor logging change

---
 modules/cache_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/cache_utils.py b/modules/cache_utils.py
index 3f5a0f31..c235d0ca 100644
--- a/modules/cache_utils.py
+++ b/modules/cache_utils.py
@@ -30,8 +30,8 @@ def process_llamacpp_cache(model, new_sequence, past_sequence):
         overlapping_sequence = new_sequence[j1:j2 + 1]
         added_chunk = new_sequence[j2 + 1:]
 
-        # print(past_sequence)
-        # print(new_sequence)
+        # print(past_sequence.tolist())
+        # print(new_sequence.tolist())
 
         print()
         print('MATCHING PREFIX=', repr(shared.tokenizer.decode(matching_prefix)))

From 46031407b599e27b89b181f1a302941e41dd6d88 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Mar 2024 18:43:04 -0700
Subject: [PATCH 03/12] Increase the cache size of convert_to_markdown to 4096

---
 modules/html_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 2b125c2c..86771dcf 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -48,7 +48,7 @@ def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
-@functools.lru_cache(maxsize=512)
+@functools.lru_cache(maxsize=4096)
 def convert_to_markdown(string):
 
     # Blockquote

From 63701f59cf75fb5b939a8580dd959ef4f722d1ee Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Mar 2024 18:54:15 -0700
Subject: [PATCH 04/12] UI: mention that n_gpu_layers > 0 is necessary for the
 GPU to be used

---
 modules/ui_model_menu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index d268770a..482d1e63 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -90,7 +90,7 @@ def create_ui():
                                 shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)
 
                             shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
-                            shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers)
+                            shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='This must be set to more than 0 for your GPU to be used.')
                             shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
                             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 18,17')
                             shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)

From 28076928ac9ef9ee4846b2a65e698520222da7cb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Mar 2024 23:41:57 -0300
Subject: [PATCH 05/12] UI: Add a new "User description" field for user
 personality/biography (#5691)

---
 extensions/openai/typing.py |  3 +-
 modules/chat.py             | 13 +++++--
 modules/shared.py           |  3 +-
 modules/ui.py               |  1 +
 modules/ui_chat.py          | 78 +++++++++++++++++++------------------
 settings-template.yaml      |  7 +++-
 6 files changed, 62 insertions(+), 43 deletions(-)

diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 3ae02e68..af7b094f 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -103,10 +103,11 @@ class ChatCompletionRequestParams(BaseModel):
     instruction_template_str: str | None = Field(default=None, description="A Jinja2 instruction template. If set, will take precedence over everything else.")
 
     character: str | None = Field(default=None, description="A character defined under text-generation-webui/characters. If not set, the default \"Assistant\" character will be used.")
-    user_name: str | None = Field(default=None, description="Your name (the user). By default, it's \"You\".", alias="name1")
     bot_name: str | None = Field(default=None, description="Overwrites the value set by character field.", alias="name2")
     context: str | None = Field(default=None, description="Overwrites the value set by character field.")
     greeting: str | None = Field(default=None, description="Overwrites the value set by character field.")
+    user_name: str | None = Field(default=None, description="Your name (the user). By default, it's \"You\".", alias="name1")
+    user_bio: str | None = Field(default=None, description="The user description/personality.")
     chat_template_str: str | None = Field(default=None, description="Jinja2 template for chat.")
 
     chat_instruct_command: str | None = None
diff --git a/modules/chat.py b/modules/chat.py
index a1fcb6b0..c8516c59 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -86,10 +86,16 @@ def generate_chat_prompt(user_input, state, **kwargs):
     if state['mode'] != 'instruct':
         chat_template_str = replace_character_names(chat_template_str, state['name1'], state['name2'])
 
-    chat_template = jinja_env.from_string(chat_template_str)
     instruction_template = jinja_env.from_string(state['instruction_template_str'])
-    chat_renderer = partial(chat_template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2'])
     instruct_renderer = partial(instruction_template.render, add_generation_prompt=False)
+    chat_template = jinja_env.from_string(chat_template_str)
+    chat_renderer = partial(
+        chat_template.render,
+        add_generation_prompt=False,
+        name1=state['name1'],
+        name2=state['name2'],
+        user_bio=replace_character_names(state['user_bio'], state['name1'], state['name2']),
+    )
 
     messages = []
 
@@ -99,7 +105,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
             messages.append({"role": "system", "content": state['custom_system_message']})
     else:
         renderer = chat_renderer
-        if state['context'].strip() != '':
+        if state['context'].strip() != '' or state['user_bio'].strip() != '':
             context = replace_character_names(state['context'], state['name1'], state['name2'])
             messages.append({"role": "system", "content": context})
 
@@ -140,6 +146,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
             command = state['chat-instruct_command']
             command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1'])
             command = command.replace('<|prompt|>', prompt)
+            command = replace_character_names(command, state['name1'], state['name2'])
 
             if _continue:
                 prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0]
diff --git a/modules/shared.py b/modules/shared.py
index 69ad0cfd..c2a44eb8 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -57,9 +57,10 @@ settings = {
     'stream': True,
     'character': 'Assistant',
     'name1': 'You',
+    'user_bio': '',
     'custom_system_message': '',
     'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n    {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' -%}\n        {{- '' + message['content'] + '\\n\\n' -}}\n    {%- else -%}\n        {%- if message['role'] == 'user' -%}\n            {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n        {%- else -%}\n            {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n        {%- endif -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n    {{-'### Response:\\n'-}}\n{%- endif -%}",
-    'chat_template_str': "{%- for message in messages %}\n    {%- if message['role'] == 'system' -%}\n        {{- message['content'] + '\\n\\n' -}}\n    {%- else -%}\n        {%- if message['role'] == 'user' -%}\n            {{- name1 + ': ' + message['content'] + '\\n'-}}\n        {%- else -%}\n            {{- name2 + ': ' + message['content'] + '\\n' -}}\n        {%- endif -%}\n    {%- endif -%}\n{%- endfor -%}",
+    'chat_template_str': "{%- for message in messages %}\n    {%- if message['role'] == 'system' -%}\n        {%- if message['content'] -%}\n            {{- message['content'] + '\\n\\n' -}}\n        {%- endif -%}\n        {%- if user_bio -%}\n            {{- user_bio + '\\n\\n' -}}\n        {%- endif -%}\n    {%- else -%}\n        {%- if message['role'] == 'user' -%}\n            {{- name1 + ': ' + message['content'] + '\\n'-}}\n        {%- else -%}\n            {{- name2 + ': ' + message['content'] + '\\n' -}}\n        {%- endif -%}\n    {%- endif -%}\n{%- endfor -%}",
     'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
     'autoload_model': False,
     'gallery-items_per_page': 50,
diff --git a/modules/ui.py b/modules/ui.py
index 4a03f843..ad1d5df4 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -170,6 +170,7 @@ def list_interface_input_elements():
         'character_menu',
         'history',
         'name1',
+        'user_bio',
         'name2',
         'greeting',
         'context',
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 7255bb99..263ff1e5 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -94,19 +94,50 @@ def create_ui():
 
 def create_chat_settings_ui():
     mu = shared.args.multi_user
-    with gr.Tab('Character'):
+    with gr.Tab('Chat'):
         with gr.Row():
             with gr.Column(scale=8):
-                with gr.Row():
-                    shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown')
-                    ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
-                    shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
-                    shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
+                with gr.Tab("Character"):
+                    with gr.Row():
+                        shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown')
+                        ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
+                        shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
+                        shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
 
-                shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name')
-                shared.gradio['name2'] = gr.Textbox(value='', lines=1, label='Character\'s name')
-                shared.gradio['context'] = gr.Textbox(value='', lines=10, label='Context', elem_classes=['add_scrollbar'])
-                shared.gradio['greeting'] = gr.Textbox(value='', lines=5, label='Greeting', elem_classes=['add_scrollbar'])
+                    shared.gradio['name2'] = gr.Textbox(value='', lines=1, label='Character\'s name')
+                    shared.gradio['context'] = gr.Textbox(value='', lines=10, label='Context', elem_classes=['add_scrollbar'])
+                    shared.gradio['greeting'] = gr.Textbox(value='', lines=5, label='Greeting', elem_classes=['add_scrollbar'])
+
+                with gr.Tab("User"):
+                    shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Name')
+                    shared.gradio['user_bio'] = gr.Textbox(value='', lines=5, label='Description', info='Here you can optionally write a description of yourself.', placeholder='{{user}}\'s personality: ...', elem_classes=['add_scrollbar'])
+
+                with gr.Tab('Chat history'):
+                    with gr.Row():
+                        with gr.Column():
+                            shared.gradio['save_chat_history'] = gr.Button(value='Save history')
+
+                        with gr.Column():
+                            shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label='Upload History JSON')
+
+                with gr.Tab('Upload character'):
+                    with gr.Tab('YAML or JSON'):
+                        with gr.Row():
+                            shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File', interactive=not mu)
+                            shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)', interactive=not mu)
+
+                        shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False)
+
+                    with gr.Tab('TavernAI PNG'):
+                        with gr.Row():
+                            with gr.Column():
+                                shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id='upload_img_tavern', interactive=not mu)
+                                shared.gradio['tavern_json'] = gr.State()
+                            with gr.Column():
+                                shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False)
+                                shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False)
+
+                        shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False)
 
             with gr.Column(scale=1):
                 shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil', interactive=not mu)
@@ -137,33 +168,6 @@ def create_chat_settings_ui():
             with gr.Column():
                 shared.gradio['chat_template_str'] = gr.Textbox(value=shared.settings['chat_template_str'], label='Chat template', lines=22, elem_classes=['add_scrollbar', 'monospace'])
 
-    with gr.Tab('Chat history'):
-        with gr.Row():
-            with gr.Column():
-                shared.gradio['save_chat_history'] = gr.Button(value='Save history')
-
-            with gr.Column():
-                shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label='Upload History JSON')
-
-    with gr.Tab('Upload character'):
-        with gr.Tab('YAML or JSON'):
-            with gr.Row():
-                shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File', interactive=not mu)
-                shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)', interactive=not mu)
-
-            shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False)
-
-        with gr.Tab('TavernAI PNG'):
-            with gr.Row():
-                with gr.Column():
-                    shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id='upload_img_tavern', interactive=not mu)
-                    shared.gradio['tavern_json'] = gr.State()
-                with gr.Column():
-                    shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False)
-                    shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False)
-
-            shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False)
-
 
 def create_event_handlers():
 
diff --git a/settings-template.yaml b/settings-template.yaml
index 87101116..bf057be7 100644
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -54,7 +54,12 @@ instruction_template_str: |-
 chat_template_str: |-
   {%- for message in messages %}
       {%- if message['role'] == 'system' -%}
-          {{- message['content'] + '\n\n' -}}
+          {%- if message['content'] -%}
+              {{- message['content'] + '\n\n' -}}
+          {%- endif -%}
+          {%- if user_bio -%}
+              {{- user_bio + '\n\n' -}}
+          {%- endif -%}
       {%- else -%}
           {%- if message['role'] == 'user' -%}
               {{- name1 + ': ' + message['content'] + '\n'-}}

From 8152152dd66f6041bb59e7cad09fd2e64f040f64 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 11 Mar 2024 19:49:22 -0700
Subject: [PATCH 06/12] Small fix after
 28076928ac9ef9ee4846b2a65e698520222da7cb

---
 modules/ui_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 263ff1e5..293d253e 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -110,7 +110,7 @@ def create_chat_settings_ui():
 
                 with gr.Tab("User"):
                     shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Name')
-                    shared.gradio['user_bio'] = gr.Textbox(value='', lines=5, label='Description', info='Here you can optionally write a description of yourself.', placeholder='{{user}}\'s personality: ...', elem_classes=['add_scrollbar'])
+                    shared.gradio['user_bio'] = gr.Textbox(value=shared.settings['user_bio'], lines=10, label='Description', info='Here you can optionally write a description of yourself.', placeholder='{{user}}\'s personality: ...', elem_classes=['add_scrollbar'])
 
                 with gr.Tab('Chat history'):
                     with gr.Row():

From edec3bf3b0dadaf78342479659e0c7711eab347b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 13 Mar 2024 08:14:34 -0700
Subject: [PATCH 07/12] UI: avoid caching convert_to_markdown calls during
 streaming

---
 modules/html_generator.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 86771dcf..902ffcc7 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -101,6 +101,17 @@ def convert_to_markdown(string):
     return html_output
 
 
+def convert_to_markdown_wrapped(string, use_cache=True):
+    '''
+    Used to avoid caching convert_to_markdown calls during streaming: use_cache=False bypasses its lru_cache.
+    '''
+
+    if use_cache:
+        return convert_to_markdown(string)
+    # __wrapped__ is the undecorated function that functools.lru_cache exposes; calling it skips the cache.
+    return convert_to_markdown.__wrapped__(string)
+
+
 def generate_basic_html(string):
     string = convert_to_markdown(string)
     string = f'<style>{readable_css}</style><div class="readable-container">{string}</div>'
@@ -196,7 +207,7 @@ def get_image_cache(path):
 def generate_instruct_html(history):
     output = f'<style>{instruct_css}</style><div class="chat" id="chat"><div class="messages">'
     for i, _row in enumerate(history):
-        row = [convert_to_markdown(entry) for entry in _row]
+        row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row]
 
         if row[0]:  # don't display empty user messages
             output += f"""
@@ -232,7 +243,7 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=
     img_me = f'<img src="file/cache/pfp_me.png?{time.time() if reset_cache else ""}">' if Path("cache/pfp_me.png").exists() else ''
 
     for i, _row in enumerate(history):
-        row = [convert_to_markdown(entry) for entry in _row]
+        row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row]
 
         if row[0]:  # don't display empty user messages
             output += f"""
@@ -275,7 +286,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
     output = f'<style>{chat_styles["wpp"]}</style><div class="chat" id="chat"><div class="messages">'
 
     for i, _row in enumerate(history):
-        row = [convert_to_markdown(entry) for entry in _row]
+        row = [convert_to_markdown_wrapped(entry, use_cache=i != len(history) - 1) for entry in _row]
 
         if row[0]:  # don't display empty user messages
             output += f"""

From 40a60e0297ad1a032019b4b6b01529ded71b1dfe Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 13 Mar 2024 08:15:49 -0700
Subject: [PATCH 08/12] Convert attention_sink_size to int (closes #5696)

---
 modules/ui_model_menu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 482d1e63..0e86a9f0 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -118,7 +118,7 @@ def create_ui():
                             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
                             shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.')
                             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
-                            shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
+                            shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
                             shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
                             shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
                             shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the  K, Q, V to the GPU. This saves VRAM but reduces the performance.')

From 2ef5490a36bc824325eeee5b4356591e76ffcc54 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 13 Mar 2024 08:18:49 -0700
Subject: [PATCH 09/12] UI: make light theme less blinding

---
 modules/ui.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/ui.py b/modules/ui.py
index ad1d5df4..0e9ebe02 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -35,7 +35,8 @@ theme = gr.themes.Default(
     border_color_primary='#c5c5d2',
     button_large_padding='6px 12px',
     body_text_color_subdued='#484848',
-    background_fill_secondary='#eaeaea'
+    background_fill_secondary='#eaeaea',
+    background_fill_primary='#fafafa',
 )
 
 if Path("notification.mp3").exists():

From d828844a6f4727827b64d6c6ed03f965af089c8f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 14 Mar 2024 08:56:28 -0700
Subject: [PATCH 10/12] Small fix: don't save truncation_length to
 settings.yaml

It should derive from model metadata or from a command-line flag.
---
 modules/ui.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui.py b/modules/ui.py
index 0e9ebe02..f973fa6f 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -222,7 +222,7 @@ def apply_interface_values(state, use_persistent=False):
 
 def save_settings(state, preset, extensions_list, show_controls, theme_state):
     output = copy.deepcopy(shared.settings)
-    exclude = ['name2', 'greeting', 'context', 'turn_template']
+    exclude = ['name2', 'greeting', 'context', 'turn_template', 'truncation_length']
     for k in state:
         if k in shared.settings and k not in exclude:
             output[k] = state[k]

From d890c99b53343e2f5f08407b2b160fe44e094917 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 14 Mar 2024 09:18:54 -0700
Subject: [PATCH 11/12] Fix StreamingLLM when content is removed from the
 beginning of the prompt

---
 modules/cache_utils.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/modules/cache_utils.py b/modules/cache_utils.py
index c235d0ca..0d1368a2 100644
--- a/modules/cache_utils.py
+++ b/modules/cache_utils.py
@@ -19,12 +19,12 @@ def process_llamacpp_cache(model, new_sequence, past_sequence):
         past_sequence = torch.tensor(past_sequence)
 
         prefix_length = find_prefix_length(past_sequence[:i1], new_sequence[:j1])
-        sink_length = prefix_length
-        if sink_length < shared.args.attention_sink_size:
-            sink_length = shared.args.attention_sink_size
-
+        sink_length = max(prefix_length, shared.args.attention_sink_size)
         removed_length = i1 - sink_length
 
+        if removed_length <= 0:
+            return past_sequence.tolist()
+
         matching_prefix = past_sequence[:prefix_length]
         removed_chunk = past_sequence[sink_length:i1]
         overlapping_sequence = new_sequence[j1:j2 + 1]
@@ -37,10 +37,11 @@ def process_llamacpp_cache(model, new_sequence, past_sequence):
         print('MATCHING PREFIX=', repr(shared.tokenizer.decode(matching_prefix)))
         print('ADDED CHUNK=', repr(shared.tokenizer.decode(added_chunk)))
         print('REMOVED CHUNK=', repr(shared.tokenizer.decode(removed_chunk)))
+        print('REMOVED LENGTH=', removed_length)
         print()
 
         # Remove interval [sink_length, sink_length + removed_length) from the context
-        # Subtract removed_length from model.n_tokens
+        # Update model.n_tokens
         model._ctx.kv_cache_seq_rm(0, sink_length, sink_length + removed_length)
         model._ctx.kv_cache_seq_shift(0, sink_length + removed_length, -1, -removed_length)
 

From 49b111e2dd2e5c4f1e6f2a25df38c3a1b1dbf4d7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Mar 2024 08:29:03 -0700
Subject: [PATCH 12/12] Lint

---
 modules/html_generator.py | 2 +-
 modules/ui_model_menu.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/html_generator.py b/modules/html_generator.py
index 902ffcc7..278f1632 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -1,8 +1,8 @@
+import functools
 import html
 import os
 import re
 import time
-import functools
 from pathlib import Path
 
 import markdown
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 0e86a9f0..a31bbcf5 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -90,7 +90,7 @@ def create_ui():
                                 shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)
 
                             shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
-                            shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='This must be set to more than 0 for your GPU to be used.')
+                            shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be set to more than 0 for your GPU to be used.')
                             shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
                             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 18,17')
                             shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)