llama.cpp: Add StreamingLLM (--streaming-llm)

oobabooga 2025-04-25 16:21:35 -07:00
parent d35818f4e1
commit 877cf44c08
3 changed files with 4 additions and 0 deletions

@@ -301,6 +301,8 @@ class LlamaServer:
     cmd += ["--device-draft", shared.args.device_draft]
 if shared.args.ctx_size_draft > 0:
     cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
+if shared.args.streaming_llm:
+    cmd += ["--cache-reuse", "1"]
 if shared.args.extra_flags:
     # Clean up the input
     extra_flags = shared.args.extra_flags.strip()
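Note: with --streaming-llm enabled, the only change to the launch logic is that --cache-reuse 1 is appended to the llama-server command line. In llama-server, --cache-reuse sets the minimum chunk size for reusing previously computed KV-cache entries via KV shifting, which is what allows old messages to be dropped from the start of the context without re-evaluating the whole prompt. A hypothetical resulting invocation (the model path and context size are placeholders, not taken from this commit):

    llama-server --model model.gguf --ctx-size 8192 --cache-reuse 1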

@@ -128,6 +128,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
 group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"')
+group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 
 # Speculative decoding
 group = parser.add_argument_group('Speculative decoding')
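With the new argument registered, the feature can be enabled from the command line. A hypothetical launch command (server.py as the webui entry point and the model name are assumptions, not part of this commit):

    python server.py --model MyModel.gguf --streaming-llm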

@@ -71,6 +71,7 @@ def create_ui():
 shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
 shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
 shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
+shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
 shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
 shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
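For reference, a minimal standalone sketch of the checkbox pattern used above (the Blocks wrapper, the hard-coded default value, and the variable name are illustrative, not part of the webui code):

    import gradio as gr

    with gr.Blocks() as demo:
        # Same pattern as the UI change above: a checkbox whose initial value
        # would normally mirror the --streaming-llm CLI flag
        streaming_llm = gr.Checkbox(
            label="streaming_llm",
            value=False,
            info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.'
        )

    demo.launch()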