Delegate GPU layer allocation to llama.cpp's --fit

This commit is contained in:
oobabooga 2026-03-04 06:37:50 -08:00
parent 8a3d866401
commit f4d787ab8d
5 changed files with 26 additions and 145 deletions

View file

@@ -329,7 +329,6 @@ class LlamaServer:
self.server_path,
"--model", self.model_path,
"--ctx-size", str(shared.args.ctx_size),
"--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--ubatch-size", str(shared.args.ubatch_size),
"--port", str(self.port),
@@ -337,6 +336,11 @@
"--flash-attn", "on",
]
if shared.args.gpu_layers > 0:
cmd += ["--gpu-layers", str(shared.args.gpu_layers), "--fit", "off"]
else:
cmd += ["--fit", "on"]
if shared.args.threads > 0:
cmd += ["--threads", str(shared.args.threads)]
if shared.args.threads_batch > 0: