From 877cf44c08dc98066118d1472844808b528fb778 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 25 Apr 2025 16:21:35 -0700
Subject: [PATCH] llama.cpp: Add StreamingLLM (`--streaming-llm`)

---
 modules/llama_cpp_server.py | 2 ++
 modules/shared.py           | 1 +
 modules/ui_model_menu.py    | 1 +
 3 files changed, 4 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 7199470d..7e5e3a4b 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -301,6 +301,8 @@ class LlamaServer:
             cmd += ["--device-draft", shared.args.device_draft]
         if shared.args.ctx_size_draft > 0:
             cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
+        if shared.args.streaming_llm:
+            cmd += ["--cache-reuse", "1"]
         if shared.args.extra_flags:
             # Clean up the input
             extra_flags = shared.args.extra_flags.strip()
diff --git a/modules/shared.py b/modules/shared.py
index 5177ac67..c40f8e21 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -128,6 +128,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
 group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"')
+group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 
 # Speculative decoding
 group = parser.add_argument_group('Speculative decoding')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index f3319cfb..1460dfec 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -71,6 +71,7 @@ def create_ui():
                     shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
                     shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
                     shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
+                    shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                     shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
                     shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                     shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
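
For reference, a minimal standalone sketch (not part of the patch) of what the change does: when shared.args.streaming_llm is set, the launcher appends "--cache-reuse 1" to the llama-server command it builds, so the server can reuse its KV cache instead of re-evaluating the whole prompt after old messages are trimmed. The build_cmd() helper and the SimpleNamespace stand-in for shared.args below are hypothetical simplifications of the real logic in modules/llama_cpp_server.py.

    # Hypothetical sketch of the relevant fragment of the command construction.
    from types import SimpleNamespace


    def build_cmd(args):
        """Build a simplified llama-server command line from parsed args."""
        cmd = ["llama-server"]
        if args.ctx_size_draft > 0:
            cmd += ["--ctx-size-draft", str(args.ctx_size_draft)]
        if args.streaming_llm:
            # StreamingLLM is mapped onto llama-server's --cache-reuse flag,
            # so trimmed prompts reuse the existing KV cache where possible.
            cmd += ["--cache-reuse", "1"]
        return cmd


    if __name__ == "__main__":
        args = SimpleNamespace(ctx_size_draft=0, streaming_llm=True)
        print(" ".join(build_cmd(args)))  # -> llama-server --cache-reuse 1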