diff --git a/css/main.css b/css/main.css
index 30089aca..0902b184 100644
--- a/css/main.css
+++ b/css/main.css
@@ -569,7 +569,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }
 
 .dark .message-body :not(pre) > code {
-  background-color: rgb(255 255 255 / 12.5%);
+  background-color: rgb(255 255 255 / 10%);
 }
 
 #chat-input {
@@ -1386,3 +1386,15 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 strong {
   font-weight: bold;
 }
+
+.min.svelte-1ybaih5 {
+  min-height: 0;
+}
+
+#vram-info .value {
+  color: #008d00;
+}
+
+.dark #vram-info .value {
+  color: #07ff07;
+}
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 1046969a..3fc7a0cc 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -282,8 +282,10 @@ class LlamaServer:
             cmd.append("--no-kv-offload")
         if shared.args.row_split:
             cmd += ["--split-mode", "row"]
+        cache_type = "fp16"
         if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
             cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
+            cache_type = shared.args.cache_type
         if shared.args.compress_pos_emb != 1:
             cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
         if shared.args.rope_freq_base > 0:
@@ -343,6 +345,7 @@ class LlamaServer:
             print(' '.join(str(item) for item in cmd[1:]))
             print()
 
+        logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}")
         # Start the server with pipes for output
         self.process = subprocess.Popen(
             cmd,
diff --git a/modules/models.py b/modules/models.py
index d0b0402a..9ecee803 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -71,7 +71,6 @@ def llama_cpp_server_loader(model_name):
     else:
         model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
 
-    logger.info(f"llama.cpp weights detected: \"{model_file}\"")
     try:
         model = LlamaServer(model_file)
         return model, model
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 4418e3fb..a8e17594 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -1,7 +1,11 @@
+import functools
 import json
 import re
+import subprocess
+from math import exp
 from pathlib import Path
 
+import gradio as gr
 import yaml
 
 from modules import chat, loaders, metadata_gguf, shared, ui
@@ -216,7 +220,17 @@ def apply_model_settings_to_state(model, state):
 
     for k in model_settings:
         if k in state:
-            state[k] = model_settings[k]
+            if k == 'gpu_layers':
+                available_vram = get_nvidia_free_vram()
+                n_layers = model_settings[k]
+                if available_vram > 0:
+                    tolerance = 906
+                    while n_layers > 0 and estimate_vram(model, n_layers, state['ctx_size'], state['cache_type']) > available_vram - tolerance:
+                        n_layers -= 1
+
+                state[k] = gr.update(value=n_layers, maximum=model_settings[k])
+            else:
+                state[k] = model_settings[k]
 
     return state
 
@@ -277,3 +291,138 @@ def save_instruction_template(model, template):
         yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
     else:
         yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")
+
+
+@functools.lru_cache(maxsize=None)
+def get_gguf_metadata_cached(model_file):
+    return metadata_gguf.load_metadata(model_file)
+
+
+def get_model_size_mb(model_file: Path) -> float:
+    filename = model_file.name
+
+    # Check for multipart pattern
+    match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename)
+
+    if match:
+        # It's a multipart file, find all matching parts
+        base_pattern = match.group(1)
+        part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf'))
+        total_size = sum(p.stat().st_size for p in part_files)
+    else:
+        # Single part
+        total_size = model_file.stat().st_size
+
+    return total_size / (1024 ** 2)  # Return size in MB
+
+
+def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
+    model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
+    metadata = get_gguf_metadata_cached(model_file)
+    size_in_mb = get_model_size_mb(model_file)
+
+    # Extract values from metadata
+    n_layers = None
+    n_kv_heads = None
+    embedding_dim = None
+    context_length = None
+    feed_forward_dim = None
+
+    for key, value in metadata.items():
+        if key.endswith('.block_count'):
+            n_layers = value
+        elif key.endswith('.attention.head_count_kv'):
+            n_kv_heads = value
+        elif key.endswith('.embedding_length'):
+            embedding_dim = value
+        elif key.endswith('.context_length'):
+            context_length = value
+        elif key.endswith('.feed_forward_length'):
+            feed_forward_dim = value
+
+    if gpu_layers > n_layers:
+        gpu_layers = n_layers
+
+    # Convert cache_type to numeric
+    if cache_type == 'q4_0':
+        cache_type = 4
+    elif cache_type == 'q8_0':
+        cache_type = 8
+    else:
+        cache_type = 16
+
+    # Derived features
+    size_per_layer = size_in_mb / max(n_layers, 1e-6)
+    context_per_layer = context_length / max(n_layers, 1e-6)
+    ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6)
+    kv_cache_factor = n_kv_heads * cache_type * ctx_size
+
+    # Helper function for smaller
+    def smaller(x, y):
+        return 1 if x < y else 0
+
+    # Calculate VRAM using the model
+    # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
+    vram = (
+        (size_per_layer - 21.19195204848197)
+        * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845))
+        + 0.0006621544775632052 * context_per_layer
+        + 3.34664386576376e-05 * kv_cache_factor
+    ) * (1.363306170123392 + gpu_layers) + 1255.163594536052
+
+    return vram
+
+
+def get_nvidia_free_vram():
+    """
+    Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output.
+
+    Returns:
+        int: The total free VRAM in MiB summed across all detected NVIDIA GPUs.
+             Returns -1 if nvidia-smi command fails (not found, error, etc.).
+             Returns 0 if nvidia-smi succeeds but no GPU memory info found.
+    """
+    try:
+        # Execute nvidia-smi command
+        result = subprocess.run(
+            ['nvidia-smi'],
+            capture_output=True,
+            text=True,
+            check=False
+        )
+
+        # Check if nvidia-smi returned an error
+        if result.returncode != 0:
+            return -1
+
+        # Parse the output for memory usage patterns
+        output = result.stdout
+
+        # Find memory usage like "XXXXMiB / YYYYMiB"
+        # Captures used and total memory for each GPU
+        matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)
+
+        if not matches:
+            # No GPUs found in expected format
+            return 0
+
+        total_free_vram_mib = 0
+        for used_mem_str, total_mem_str in matches:
+            try:
+                used_mib = int(used_mem_str)
+                total_mib = int(total_mem_str)
+                total_free_vram_mib += (total_mib - used_mib)
+            except ValueError:
+                # Skip malformed entries
+                pass
+
+        return total_free_vram_mib
+
+    except FileNotFoundError:
+        # nvidia-smi not found (likely no NVIDIA drivers installed)
+        return -1
+    except Exception:
+        # Handle any other unexpected exceptions
+        return -1
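The regex in get_nvidia_free_vram() keys off the memory column of the default nvidia-smi table, which prints "usedMiB / totalMiB" once per GPU. A minimal standalone illustration of that parsing, using a fabricated sample row (the line below is made up and is not part of the diff):

import re

# Fabricated nvidia-smi table row, for illustration only
sample = "| 30%   45C    P8    25W / 320W |   1234MiB / 24576MiB |      2%      Default |"

# Same pattern as in get_nvidia_free_vram(): captures (used, total) in MiB per GPU
matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", sample)
free_mib = sum(int(total) - int(used) for used, total in matches)

print(matches)   # [('1234', '24576')]
print(free_mib)  # 23342

Summing (total - used) over every match is what lets the function report the combined free memory across all detected GPUs, as its docstring states.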
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 81ad1a53..2353f39c 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -11,6 +11,7 @@ from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model
 from modules.models_settings import (
     apply_model_settings_to_state,
+    estimate_vram,
     get_model_metadata,
     save_instruction_template,
     save_model_settings,
@@ -44,6 +45,7 @@ def create_ui():
                         shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
 
                     with gr.Column():
+                        shared.gradio['vram_info'] = gr.HTML(value=lambda: estimate_vram_wrapper(shared.args.model, shared.args.gpu_layers, shared.args.ctx_size, shared.args.cache_type))
                         shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
                         shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                         shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
@@ -105,7 +107,6 @@ def create_ui():
                     ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
                     shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
 
-        with gr.Column():
             with gr.Tab("Download"):
                 shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
@@ -148,6 +149,11 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
 
+    shared.gradio['model_menu'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
+    shared.gradio['gpu_layers'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
+    shared.gradio['ctx_size'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
+    shared.gradio['cache_type'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
+
     if not shared.args.portable:
         shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
 
@@ -275,6 +281,14 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
         yield traceback.format_exc().replace('\n', '\n\n')
 
 
+def estimate_vram_wrapper(model, gpu_layers, ctx_size, cache_type):
+    if model in ["None", None]:
+        return "<div id=\"vram-info\">Estimated VRAM to load the model:</div>"
+
+    result = estimate_vram(model, gpu_layers, ctx_size, cache_type)
+    return f"<div id=\"vram-info\">Estimated VRAM to load the model: <span class=\"value\">{result:.0f} MiB</span></div>"
+
+
 def update_truncation_length(current_length, state):
     if 'loader' in state:
         if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp':
diff --git a/server.py b/server.py
index b0b9e633..c35d65a8 100644
--- a/server.py
+++ b/server.py
@@ -49,8 +49,10 @@ from modules.extensions import apply_extensions
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
+    estimate_vram,
     get_fallback_settings,
     get_model_metadata,
+    get_nvidia_free_vram,
     update_model_parameters
 )
 from modules.shared import do_cmd_flags_warnings
@@ -248,6 +250,16 @@ if __name__ == "__main__":
         model_settings = get_model_metadata(model_name)
         update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments
 
+        if 'gpu_layers' not in shared.provided_arguments:
+            available_vram = get_nvidia_free_vram()
+            if available_vram > 0:
+                n_layers = model_settings['gpu_layers']
+                tolerance = 906
+                while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance:
+                    n_layers -= 1
+
+                shared.args.gpu_layers = n_layers
+
         # Load the model
         shared.model, shared.tokenizer = load_model(model_name)
 
         if shared.args.lora:
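As a rough sanity check, the fitted curve in estimate_vram() can be evaluated on its own, outside the webui. The sketch below re-runs the same arithmetic for a hypothetical 48-layer, ~13.8 GB GGUF model; every input value is made up for illustration, and only the coefficients are taken from the function above:

from math import exp

# Hypothetical inputs (not from any real model)
size_in_mb = 13800        # total GGUF size on disk, in MiB
n_layers = 48             # *.block_count
n_kv_heads = 8            # *.attention.head_count_kv
embedding_dim = 5120      # *.embedding_length
context_length = 32768    # *.context_length
feed_forward_dim = 13824  # *.feed_forward_length
gpu_layers = 48
ctx_size = 8192
cache_bits = 16           # fp16 cache; q8_0 -> 8, q4_0 -> 4

# Same derived features as estimate_vram()
size_per_layer = size_in_mb / n_layers
context_per_layer = context_length / n_layers
ffn_per_embedding = feed_forward_dim / embedding_dim
kv_cache_factor = n_kv_heads * cache_bits * ctx_size

smaller = 1 if ffn_per_embedding < 2.671096993407845 else 0

vram = (
    (size_per_layer - 21.19195204848197)
    * exp(0.0001047328491557063 * size_in_mb * smaller)
    + 0.0006621544775632052 * context_per_layer
    + 3.34664386576376e-05 * kv_cache_factor
) * (1.363306170123392 + gpu_layers) + 1255.163594536052

print(f"Estimated VRAM: {vram:.0f} MiB")  # roughly 16 GB for these made-up numbers

The 906 MiB tolerance subtracted from the free VRAM in apply_model_settings_to_state() and in server.py then serves as a safety margin on top of this estimate when gpu_layers is auto-reduced.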