Estimate the VRAM for GGUF models + autoset gpu-layers (#6980)

This commit is contained in:
oobabooga 2025-05-16 00:07:37 -03:00 committed by GitHub
parent c4a715fd1e
commit 5534d01da0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 193 additions and 4 deletions

View file

@@ -282,8 +282,10 @@ class LlamaServer:
cmd.append("--no-kv-offload")
if shared.args.row_split:
cmd += ["--split-mode", "row"]
cache_type = "fp16"
if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
cache_type = shared.args.cache_type
if shared.args.compress_pos_emb != 1:
cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
if shared.args.rope_freq_base > 0:
@@ -343,6 +345,7 @@ class LlamaServer:
print(' '.join(str(item) for item in cmd[1:]))
print()
logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}")
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,