Auto-adjust GPU layers on context size and cache type changes + many fixes

oobabooga 2025-05-16 09:07:38 -07:00
parent 93e1850a2c
commit 4925c307cf
3 changed files with 109 additions and 38 deletions


@@ -49,10 +49,9 @@ from modules.extensions import apply_extensions
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
-    estimate_vram,
     get_fallback_settings,
     get_model_metadata,
-    get_nvidia_free_vram,
+    update_gpu_layers_and_vram,
     update_model_parameters
 )
 from modules.shared import do_cmd_flags_warnings
@@ -250,15 +249,19 @@ if __name__ == "__main__":
         model_settings = get_model_metadata(model_name)
         update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments

-        if 'gpu_layers' not in shared.provided_arguments:
-            available_vram = get_nvidia_free_vram()
-            if available_vram > 0:
-                n_layers = model_settings['gpu_layers']
-                tolerance = 906
-                while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance:
-                    n_layers -= 1
+        # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
+        if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
+            vram_usage, adjusted_layers = update_gpu_layers_and_vram(
+                shared.args.loader,
+                model_name,
+                model_settings['gpu_layers'],
+                shared.args.ctx_size,
+                shared.args.cache_type,
+                auto_adjust=True,
+                for_ui=False
+            )

-                shared.args.gpu_layers = n_layers
+            shared.args.gpu_layers = adjusted_layers

         # Load the model
         shared.model, shared.tokenizer = load_model(model_name)
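The removed lines show the strategy the new helper encapsulates: query free NVIDIA VRAM, then drop GPU layers one at a time until the estimated usage for the chosen context size and cache type fits under the available memory minus a 906 MiB safety margin. Below is a minimal sketch of what update_gpu_layers_and_vram() plausibly does, assuming it wraps that same loop around the existing estimate_vram() and get_nvidia_free_vram() helpers in modules/models_settings.py; the actual implementation (including the effect of for_ui) may differ.

# Hypothetical sketch only: reconstructs the removed server.py loop as the
# body of the new helper. estimate_vram() and get_nvidia_free_vram() are the
# real helpers the old code imported; everything else is an assumption.
from modules.models_settings import estimate_vram, get_nvidia_free_vram


def update_gpu_layers_and_vram(loader, model_name, gpu_layers, ctx_size,
                               cache_type, auto_adjust=True, for_ui=False):
    # Assumption: only llama.cpp models have a gpu_layers setting to adjust.
    if loader != 'llama.cpp':
        return 0, gpu_layers

    n_layers = gpu_layers
    available_vram = get_nvidia_free_vram()
    tolerance = 906  # MiB of headroom, taken from the removed inline loop

    if auto_adjust and available_vram > 0:
        # Shed layers one at a time until the estimated footprint for this
        # context size and KV-cache type fits under free VRAM minus headroom.
        while n_layers > 0 and estimate_vram(
            model_name, n_layers, ctx_size, cache_type
        ) > available_vram - tolerance:
            n_layers -= 1

    vram_usage = estimate_vram(model_name, n_layers, ctx_size, cache_type)
    return vram_usage, n_layers

Centralizing the loop in one helper means server startup and the web UI presumably share a single code path, so changing ctx_size or cache_type can re-trigger the same adjustment, which is what the commit title describes.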