From 9119ce0680f3e03fb1e726b2942606ff37d01bd0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 15 Mar 2026 09:22:38 -0700 Subject: [PATCH] llama.cpp: Use `--fit-ctx 8192` when `--fit on` is used This sets the minimum acceptable context length, which by default is 4096. --- modules/llama_cpp_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 05c07748..c3a8d105 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -378,6 +378,7 @@ class LlamaServer: cmd += ["--gpu-layers", str(shared.args.gpu_layers), "--fit", "off"] else: cmd += ["--fit", "on"] + cmd += ["--fit-ctx", "8192"] if shared.args.fit_target: cmd += ["--fit-target", shared.args.fit_target]