Delegate GPU layer allocation to llama.cpp's --fit

This commit is contained in:
oobabooga 2026-03-04 06:37:50 -08:00
parent 8a3d866401
commit f4d787ab8d
5 changed files with 26 additions and 145 deletions

View file

@@ -329,7 +329,6 @@ class LlamaServer:
self.server_path,
"--model", self.model_path,
"--ctx-size", str(shared.args.ctx_size),
"--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--ubatch-size", str(shared.args.ubatch_size),
"--port", str(self.port),
@@ -337,6 +336,11 @@
"--flash-attn", "on",
]
if shared.args.gpu_layers > 0:
cmd += ["--gpu-layers", str(shared.args.gpu_layers), "--fit", "off"]
else:
cmd += ["--fit", "on"]
if shared.args.threads > 0:
cmd += ["--threads", str(shared.args.threads)]
if shared.args.threads_batch > 0: