From 0f77ff9670364d605ad40bd1addd153213ce65f0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 19:19:22 -0700 Subject: [PATCH 1/3] UI: Use total VRAM (not free) for layers calculation when a model is loaded --- modules/models_settings.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 3a2400d4..6b9493ca 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -379,12 +379,15 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): return vram -def get_nvidia_free_vram(): +def get_nvidia_vram(return_free=True): """ - Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output. + Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output. + + Args: + return_free (bool): If True, returns free VRAM. If False, returns total VRAM. Returns: - int: The total free VRAM in MiB summed across all detected NVIDIA GPUs. + int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs. Returns -1 if nvidia-smi command fails (not found, error, etc.). Returns 0 if nvidia-smi succeeds but no GPU memory info found. """ @@ -412,17 +415,21 @@ def get_nvidia_free_vram(): # No GPUs found in expected format return 0 + total_vram_mib = 0 total_free_vram_mib = 0 + for used_mem_str, total_mem_str in matches: try: used_mib = int(used_mem_str) total_mib = int(total_mem_str) + total_vram_mib += total_mib total_free_vram_mib += (total_mib - used_mib) except ValueError: # Skip malformed entries pass - return total_free_vram_mib + # Return either free or total VRAM based on the flag + return total_free_vram_mib if return_free else total_vram_mib except FileNotFoundError: # nvidia-smi not found (likely no NVIDIA drivers installed) @@ -473,8 +480,10 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, # No user setting, auto-adjust from the maximum current_layers = max_layers # Start from max - # Auto-adjust based on available VRAM - available_vram = get_nvidia_free_vram() + # Auto-adjust based on available/total VRAM + # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion + return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True + available_vram = get_nvidia_vram(return_free=return_free) if available_vram > 0: tolerance = 906 while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: From 052c82b664faf109d2ded6c0250ad777259adb94 Mon Sep 17 00:00:00 2001 From: mamei16 Date: Sat, 17 May 2025 16:19:13 +0200 Subject: [PATCH 2/3] Fix KeyError: 'gpu_layers' when loading existing model settings (#6991) --- modules/models_settings.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 6b9493ca..10234b4b 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -154,10 +154,11 @@ def get_model_metadata(model): for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): for k in settings[pat]: + new_k = k if k == 'n_gpu_layers': - k = 'gpu_layers' + new_k = 'gpu_layers' - model_settings[k] = settings[pat][k] + model_settings[new_k] = settings[pat][k] # Load instruction template if defined by name rather than by value if model_settings['instruction_template'] != 'Custom (obtained from model metadata)': From 4800d1d522f84efaa50f4222aefd6fcae7e19e0c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 22:33:32 -0700 Subject: [PATCH 3/3] More robust VRAM calculation --- modules/models_settings.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 6b9493ca..81a7a00e 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -2,7 +2,7 @@ import functools import json import re import subprocess -from math import exp +from math import floor from pathlib import Path import gradio as gr @@ -331,8 +331,6 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): n_layers = None n_kv_heads = None embedding_dim = None - context_length = None - feed_forward_dim = None for key, value in metadata.items(): if key.endswith('.block_count'): @@ -341,10 +339,6 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): n_kv_heads = value elif key.endswith('.embedding_length'): embedding_dim = value - elif key.endswith('.context_length'): - context_length = value - elif key.endswith('.feed_forward_length'): - feed_forward_dim = value if gpu_layers > n_layers: gpu_layers = n_layers @@ -359,22 +353,16 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): # Derived features size_per_layer = size_in_mb / max(n_layers, 1e-6) - context_per_layer = context_length / max(n_layers, 1e-6) - ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6) kv_cache_factor = n_kv_heads * cache_type * ctx_size - - # Helper function for smaller - def smaller(x, y): - return 1 if x < y else 0 + embedding_per_context = embedding_dim / ctx_size # Calculate VRAM using the model # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/ vram = ( - (size_per_layer - 21.19195204848197) - * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845)) - + 0.0006621544775632052 * context_per_layer - + 3.34664386576376e-05 * kv_cache_factor - ) * (1.363306170123392 + gpu_layers) + 1255.163594536052 + (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor) + * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632))) + + 1516.522943869404 + ) return vram @@ -485,7 +473,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True available_vram = get_nvidia_vram(return_free=return_free) if available_vram > 0: - tolerance = 906 + tolerance = 577 while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: current_layers -= 1