From f4d787ab8d9e6bdb84695be347d3858fe73b6aa7 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 4 Mar 2026 06:37:50 -0800
Subject: [PATCH] Delegate GPU layer allocation to llama.cpp's --fit

---
 modules/llama_cpp_server.py |   6 +-
 modules/models_settings.py  | 132 ++++--------------------------------
 modules/shared.py           |   2 +-
 modules/ui_model_menu.py    |  16 ++---
 server.py                   |  15 ----
 5 files changed, 26 insertions(+), 145 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index baf474bd..4bf57ee8 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -329,7 +329,6 @@ class LlamaServer:
             self.server_path,
             "--model", self.model_path,
             "--ctx-size", str(shared.args.ctx_size),
-            "--gpu-layers", str(shared.args.gpu_layers),
             "--batch-size", str(shared.args.batch_size),
             "--ubatch-size", str(shared.args.ubatch_size),
             "--port", str(self.port),
@@ -337,6 +336,11 @@ class LlamaServer:
             "--flash-attn", "on",
         ]
 
+        if shared.args.gpu_layers > 0:
+            cmd += ["--gpu-layers", str(shared.args.gpu_layers), "--fit", "off"]
+        else:
+            cmd += ["--fit", "on"]
+
         if shared.args.threads > 0:
             cmd += ["--threads", str(shared.args.threads)]
         if shared.args.threads_batch > 0:
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 15ff2830..3b28a800 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -1,7 +1,6 @@
 import functools
 import json
 import re
-import subprocess
 from math import floor
 from pathlib import Path
 
@@ -78,7 +77,7 @@ def get_model_metadata(model):
         elif k.endswith('rope.scaling.factor'):
             model_settings['compress_pos_emb'] = metadata[k]
         elif k.endswith('.block_count'):
-            model_settings['gpu_layers'] = metadata[k] + 1
+            model_settings['gpu_layers'] = 0
             model_settings['max_gpu_layers'] = metadata[k] + 1
 
     if 'tokenizer.chat_template' in metadata:
@@ -265,16 +264,18 @@ def apply_model_settings_to_state(model, state):
 
     # Handle GPU layers and VRAM update for llama.cpp
     if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
-        vram_info, gpu_layers_update = update_gpu_layers_and_vram(
+        gpu_layers = model_settings['gpu_layers']  # 0 (auto) by default, or user-saved value
+        max_layers = model_settings.get('max_gpu_layers', 256)
+        state['gpu_layers'] = gr.update(value=gpu_layers, maximum=max_layers)
+
+        vram_info = update_gpu_layers_and_vram(
             state['loader'],
             model,
-            model_settings['gpu_layers'],
+            gpu_layers,
             state['ctx_size'],
             state['cache_type'],
-            auto_adjust=True
         )
 
-        state['gpu_layers'] = gpu_layers_update
         state['vram_info'] = vram_info
 
     return state
@@ -412,120 +413,13 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
     return vram
 
 
-def get_nvidia_vram(return_free=True):
+def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type):
     """
-    Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output.
-
-    Args:
-        return_free (bool): If True, returns free VRAM. If False, returns total VRAM.
-
-    Returns:
-        int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs.
-        Returns -1 if nvidia-smi command fails (not found, error, etc.).
-        Returns 0 if nvidia-smi succeeds but no GPU memory info found.
-    """
-    try:
-        # Execute nvidia-smi command
-        result = subprocess.run(
-            ['nvidia-smi'],
-            capture_output=True,
-            text=True,
-            check=False
-        )
-
-        # Check if nvidia-smi returned an error
-        if result.returncode != 0:
-            return -1
-
-        # Parse the output for memory usage patterns
-        output = result.stdout
-
-        # Find memory usage like "XXXXMiB / YYYYMiB"
-        # Captures used and total memory for each GPU
-        matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)
-
-        if not matches:
-            # No GPUs found in expected format
-            return 0
-
-        total_vram_mib = 0
-        total_free_vram_mib = 0
-
-        for used_mem_str, total_mem_str in matches:
-            try:
-                used_mib = int(used_mem_str)
-                total_mib = int(total_mem_str)
-                total_vram_mib += total_mib
-                total_free_vram_mib += (total_mib - used_mib)
-            except ValueError:
-                # Skip malformed entries
-                pass
-
-        # Return either free or total VRAM based on the flag
-        return total_free_vram_mib if return_free else total_vram_mib
-
-    except FileNotFoundError:
-        # nvidia-smi not found (likely no NVIDIA drivers installed)
-        return -1
-    except Exception:
-        # Handle any other unexpected exceptions
-        return -1
-
-
-def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
-    """
-    Unified function to handle GPU layers and VRAM updates.
-
-    Args:
-        for_ui: If True, returns Gradio updates. If False, returns raw values.
-
-    Returns:
-        - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
-        - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
+    Compute the estimated VRAM usage for the given GPU layers and return
+    an HTML string for the UI display.
     """
     if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
-        vram_info = "Estimated VRAM to load the model:"
-        if for_ui:
-            return (vram_info, gr.update()) if auto_adjust else vram_info
-        else:
-            return (0, gpu_layers) if auto_adjust else 0
+        return "Estimated VRAM to load the model:"
 
-    # Get model settings including user preferences
-    model_settings = get_model_metadata(model)
-
-    current_layers = gpu_layers
-    max_layers = model_settings.get('max_gpu_layers', 256)
-
-    if auto_adjust:
-        # Check if this is a user-saved setting
-        user_config = shared.user_config
-        model_regex = Path(model).name + '$'
-        has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex]
-
-        if not has_user_setting:
-            # No user setting, auto-adjust from the maximum
-            current_layers = max_layers  # Start from max
-
-            # Auto-adjust based on available/total VRAM
-            # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion
-            return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True
-            available_vram = get_nvidia_vram(return_free=return_free)
-            if available_vram > 0:
-                tolerance = 577
-                while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
-                    current_layers -= 1
-
-    # Calculate VRAM with current layers
-    vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
-
-    if for_ui:
-        vram_info = f"Estimated VRAM to load the model: {vram_usage:.0f} MiB"
-        if auto_adjust:
-            return vram_info, gr.update(value=current_layers, maximum=max_layers)
-        else:
-            return vram_info
-    else:
-        if auto_adjust:
-            return vram_usage, current_layers
-        else:
-            return vram_usage
+    vram_usage = estimate_vram(model, gpu_layers, ctx_size, cache_type)
+    return f"Estimated VRAM to load the model: {vram_usage:.0f} MiB"
diff --git a/modules/shared.py b/modules/shared.py
index 88e4b182..2fba530b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -84,7 +84,7 @@ group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the pr
 
 # llama.cpp
 group = parser.add_argument_group('llama.cpp')
-group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
+group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=0, metavar='N', help='Number of layers to offload to the GPU. 0 means auto (llama.cpp decides via --fit).')
 group.add_argument('--cpu-moe', action='store_true', help='Move the experts to the CPU (for MoE models).')
 group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
 group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 86adc229..b9d731ee 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -41,7 +41,7 @@ def create_ui():
             gr.Markdown("## Main options")
             with gr.Row():
                 with gr.Column():
-                    shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
+                    shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='0 = auto (llama.cpp decides via --fit). Set manually to override.')
                     shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
                     shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                     shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
@@ -157,22 +157,22 @@ def create_event_handlers():
         handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
 
     shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then(
-        partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False)
+        update_gpu_layers_and_vram, gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
 
     shared.gradio['save_model_settings'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
 
-    # For ctx_size and cache_type - auto-adjust GPU layers
+    # For ctx_size and cache_type - update VRAM display
     for param in ['ctx_size', 'cache_type']:
         shared.gradio[param].change(
-            partial(update_gpu_layers_and_vram, auto_adjust=True),
+            update_gpu_layers_and_vram,
             gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
-            gradio('vram_info', 'gpu_layers'), show_progress=False)
+            gradio('vram_info'), show_progress=False)
 
     # For manual gpu_layers changes - only update VRAM
     shared.gradio['gpu_layers'].change(
-        partial(update_gpu_layers_and_vram, auto_adjust=False),
+        update_gpu_layers_and_vram,
         gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
         gradio('vram_info'), show_progress=False)
 
@@ -386,8 +386,6 @@ def get_initial_vram_info():
             shared.args.gpu_layers,
             shared.args.ctx_size,
             shared.args.cache_type,
-            auto_adjust=False,
-            for_ui=True
         )
 
     return "Estimated VRAM to load the model:"
@@ -396,7 +394,7 @@ def get_initial_gpu_layers_max():
     if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
         model_settings = get_model_metadata(shared.model_name)
-        return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256))
+        return model_settings.get('max_gpu_layers', 256)
     return 256
 
diff --git a/server.py b/server.py
index 25776f2d..304a0478 100644
--- a/server.py
+++ b/server.py
@@ -62,7 +62,6 @@ from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
     get_fallback_settings,
     get_model_metadata,
-    update_gpu_layers_and_vram,
     update_model_parameters
 )
 from modules.shared import do_cmd_flags_warnings
@@ -315,20 +314,6 @@ if __name__ == "__main__":
         model_settings = get_model_metadata(shared.model_name)
         update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments
 
-        # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
-        if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
-            vram_usage, adjusted_layers = update_gpu_layers_and_vram(
-                shared.args.loader,
-                shared.model_name,
-                model_settings['gpu_layers'],
-                shared.args.ctx_size,
-                shared.args.cache_type,
-                auto_adjust=True,
-                for_ui=False
-            )
-
-            shared.args.gpu_layers = adjusted_layers
-
         # Load the model
         shared.model, shared.tokenizer = load_model(shared.model_name)
     if shared.args.lora: