From 877cf44c08dc98066118d1472844808b528fb778 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 25 Apr 2025 16:21:35 -0700
Subject: [PATCH] llama.cpp: Add StreamingLLM (`--streaming-llm`)

---
 modules/llama_cpp_server.py | 2 ++
 modules/shared.py           | 1 +
 modules/ui_model_menu.py    | 1 +
 3 files changed, 4 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 7199470d..7e5e3a4b 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -301,6 +301,8 @@ class LlamaServer:
             cmd += ["--device-draft", shared.args.device_draft]
         if shared.args.ctx_size_draft > 0:
             cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
+        if shared.args.streaming_llm:
+            cmd += ["--cache-reuse", "1"]
         if shared.args.extra_flags:
             # Clean up the input
             extra_flags = shared.args.extra_flags.strip()
diff --git a/modules/shared.py b/modules/shared.py
index 5177ac67..c40f8e21 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -128,6 +128,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
 group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"')
+group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 
 # Speculative decoding
 group = parser.add_argument_group('Speculative decoding')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index f3319cfb..1460dfec 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -71,6 +71,7 @@ def create_ui():
                     shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
                     shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
                     shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
+                    shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                     shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
                     shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                     shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
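
For reference, a minimal standalone sketch (not part of the patch) of what the change does: when shared.args.streaming_llm is set, the launcher appends "--cache-reuse 1" to the llama-server command it builds, so the server can reuse its KV cache instead of re-evaluating the whole prompt after old messages are trimmed. The build_cmd() helper and the SimpleNamespace stand-in for shared.args below are hypothetical simplifications of the real logic in modules/llama_cpp_server.py.

    # Hypothetical sketch of the relevant fragment of the command construction.
    from types import SimpleNamespace


    def build_cmd(args):
        """Build a simplified llama-server command line from parsed args."""
        cmd = ["llama-server"]
        if args.ctx_size_draft > 0:
            cmd += ["--ctx-size-draft", str(args.ctx_size_draft)]
        if args.streaming_llm:
            # StreamingLLM is mapped onto llama-server's --cache-reuse flag,
            # so trimmed prompts reuse the existing KV cache where possible.
            cmd += ["--cache-reuse", "1"]
        return cmd


    if __name__ == "__main__":
        args = SimpleNamespace(ctx_size_draft=0, streaming_llm=True)
        print(" ".join(build_cmd(args)))  # -> llama-server --cache-reuse 1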