llama.cpp: Add StreamingLLM (--streaming-llm)

oobabooga 2025-04-25 16:21:35 -07:00
parent d35818f4e1
commit 877cf44c08
3 changed files with 4 additions and 0 deletions

@@ -301,6 +301,8 @@ class LlamaServer:
     cmd += ["--device-draft", shared.args.device_draft]
 if shared.args.ctx_size_draft > 0:
     cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
+if shared.args.streaming_llm:
+    cmd += ["--cache-reuse", "1"]
 if shared.args.extra_flags:
     # Clean up the input
     extra_flags = shared.args.extra_flags.strip()
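Note: with --streaming-llm enabled, the only change to the launch logic is that --cache-reuse 1 is appended to the llama-server command line. In llama-server, --cache-reuse sets the minimum chunk size for reusing previously computed KV-cache entries via KV shifting, which is what allows old messages to be dropped from the start of the context without re-evaluating the whole prompt. A hypothetical resulting invocation (the model path and context size are placeholders, not taken from this commit):

    llama-server --model model.gguf --ctx-size 8192 --cache-reuse 1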

@@ -128,6 +128,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
 group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"')
+group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 
 # Speculative decoding
 group = parser.add_argument_group('Speculative decoding')
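With the new argument registered, the feature can be enabled from the command line. A hypothetical launch command (server.py as the webui entry point and the model name are assumptions, not part of this commit):

    python server.py --model MyModel.gguf --streaming-llm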

@@ -71,6 +71,7 @@ def create_ui():
 shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
 shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
 shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
+shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
 shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
 shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
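For reference, a minimal standalone sketch of the checkbox pattern used above (the Blocks wrapper, the hard-coded default value, and the variable name are illustrative, not part of the webui code):

    import gradio as gr

    with gr.Blocks() as demo:
        # Same pattern as the UI change above: a checkbox whose initial value
        # would normally mirror the --streaming-llm CLI flag
        streaming_llm = gr.Checkbox(
            label="streaming_llm",
            value=False,
            info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.'
        )

    demo.launch()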