diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index baf474bd..4bf57ee8 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -329,7 +329,6 @@ class LlamaServer:
self.server_path,
"--model", self.model_path,
"--ctx-size", str(shared.args.ctx_size),
- "--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--ubatch-size", str(shared.args.ubatch_size),
"--port", str(self.port),
@@ -337,6 +336,11 @@ class LlamaServer:
"--flash-attn", "on",
]
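+ # gpu_layers == 0 means auto: let llama.cpp's --fit decide the offload.
+ # An explicit layer count pins the split and turns fitting off.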
+ if shared.args.gpu_layers > 0:
+ cmd += ["--gpu-layers", str(shared.args.gpu_layers), "--fit", "off"]
+ else:
+ cmd += ["--fit", "on"]
+
if shared.args.threads > 0:
cmd += ["--threads", str(shared.args.threads)]
if shared.args.threads_batch > 0:
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 15ff2830..3b28a800 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -1,7 +1,6 @@
import functools
import json
import re
-import subprocess
from math import floor
from pathlib import Path
@@ -78,7 +77,7 @@ def get_model_metadata(model):
elif k.endswith('rope.scaling.factor'):
model_settings['compress_pos_emb'] = metadata[k]
elif k.endswith('.block_count'):
- model_settings['gpu_layers'] = metadata[k] + 1
+ model_settings['gpu_layers'] = 0
model_settings['max_gpu_layers'] = metadata[k] + 1
if 'tokenizer.chat_template' in metadata:
@@ -265,16 +264,18 @@ def apply_model_settings_to_state(model, state):
# Handle GPU layers and VRAM update for llama.cpp
if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
- vram_info, gpu_layers_update = update_gpu_layers_and_vram(
+ gpu_layers = model_settings['gpu_layers'] # 0 (auto) by default, or user-saved value
+ max_layers = model_settings.get('max_gpu_layers', 256)
+ state['gpu_layers'] = gr.update(value=gpu_layers, maximum=max_layers)
+
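+ # The estimate below is display-only; llama.cpp's --fit handles the offload at load time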
+ vram_info = update_gpu_layers_and_vram(
state['loader'],
model,
- model_settings['gpu_layers'],
+ gpu_layers,
state['ctx_size'],
state['cache_type'],
- auto_adjust=True
)
- state['gpu_layers'] = gpu_layers_update
state['vram_info'] = vram_info
return state
@@ -412,120 +413,13 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
return vram
-def get_nvidia_vram(return_free=True):
+def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type):
"""
- Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output.
-
- Args:
- return_free (bool): If True, returns free VRAM. If False, returns total VRAM.
-
- Returns:
- int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs.
- Returns -1 if nvidia-smi command fails (not found, error, etc.).
- Returns 0 if nvidia-smi succeeds but no GPU memory info found.
- """
- try:
- # Execute nvidia-smi command
- result = subprocess.run(
- ['nvidia-smi'],
- capture_output=True,
- text=True,
- check=False
- )
-
- # Check if nvidia-smi returned an error
- if result.returncode != 0:
- return -1
-
- # Parse the output for memory usage patterns
- output = result.stdout
-
- # Find memory usage like "XXXXMiB / YYYYMiB"
- # Captures used and total memory for each GPU
- matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)
-
- if not matches:
- # No GPUs found in expected format
- return 0
-
- total_vram_mib = 0
- total_free_vram_mib = 0
-
- for used_mem_str, total_mem_str in matches:
- try:
- used_mib = int(used_mem_str)
- total_mib = int(total_mem_str)
- total_vram_mib += total_mib
- total_free_vram_mib += (total_mib - used_mib)
- except ValueError:
- # Skip malformed entries
- pass
-
- # Return either free or total VRAM based on the flag
- return total_free_vram_mib if return_free else total_vram_mib
-
- except FileNotFoundError:
- # nvidia-smi not found (likely no NVIDIA drivers installed)
- return -1
- except Exception:
- # Handle any other unexpected exceptions
- return -1
-
-
-def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
- """
- Unified function to handle GPU layers and VRAM updates.
-
- Args:
- for_ui: If True, returns Gradio updates. If False, returns raw values.
-
- Returns:
- - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
- - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
+ Compute the estimated VRAM usage for the given GPU layers and return
+ an HTML string for the UI display.
"""
if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
- vram_info = "
Estimated VRAM to load the model:
"
- if for_ui:
- return (vram_info, gr.update()) if auto_adjust else vram_info
- else:
- return (0, gpu_layers) if auto_adjust else 0
+ return "Estimated VRAM to load the model:
"
- # Get model settings including user preferences
- model_settings = get_model_metadata(model)
-
- current_layers = gpu_layers
- max_layers = model_settings.get('max_gpu_layers', 256)
-
- if auto_adjust:
- # Check if this is a user-saved setting
- user_config = shared.user_config
- model_regex = Path(model).name + '$'
- has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex]
-
- if not has_user_setting:
- # No user setting, auto-adjust from the maximum
- current_layers = max_layers # Start from max
-
- # Auto-adjust based on available/total VRAM
- # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion
- return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True
- available_vram = get_nvidia_vram(return_free=return_free)
- if available_vram > 0:
- tolerance = 577
- while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
- current_layers -= 1
-
- # Calculate VRAM with current layers
- vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
-
- if for_ui:
- vram_info = f"Estimated VRAM to load the model: {vram_usage:.0f} MiB
"
- if auto_adjust:
- return vram_info, gr.update(value=current_layers, maximum=max_layers)
- else:
- return vram_info
- else:
- if auto_adjust:
- return vram_usage, current_layers
- else:
- return vram_usage
+ vram_usage = estimate_vram(model, gpu_layers, ctx_size, cache_type)
+ return f"Estimated VRAM to load the model: {vram_usage:.0f} MiB
"
diff --git a/modules/shared.py b/modules/shared.py
index 88e4b182..2fba530b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -84,7 +84,7 @@ group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the pr
# llama.cpp
group = parser.add_argument_group('llama.cpp')
-group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
+group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=0, metavar='N', help='Number of layers to offload to the GPU. 0 means auto (llama.cpp decides via --fit).')
group.add_argument('--cpu-moe', action='store_true', help='Move the experts to the CPU (for MoE models).')
group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 86adc229..b9d731ee 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -41,7 +41,7 @@ def create_ui():
gr.Markdown("## Main options")
with gr.Row():
with gr.Column():
- shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
+ shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='0 = auto (llama.cpp decides via --fit). Set manually to override.')
shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
@@ -157,22 +157,22 @@ def create_event_handlers():
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then(
- partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False)
+ update_gpu_layers_and_vram, gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
shared.gradio['save_model_settings'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
- # For ctx_size and cache_type - auto-adjust GPU layers
+ # For ctx_size and cache_type - update VRAM display
for param in ['ctx_size', 'cache_type']:
shared.gradio[param].change(
- partial(update_gpu_layers_and_vram, auto_adjust=True),
+ update_gpu_layers_and_vram,
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
- gradio('vram_info', 'gpu_layers'), show_progress=False)
+ gradio('vram_info'), show_progress=False)
# For manual gpu_layers changes - only update VRAM
shared.gradio['gpu_layers'].change(
- partial(update_gpu_layers_and_vram, auto_adjust=False),
+ update_gpu_layers_and_vram,
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info'), show_progress=False)
@@ -386,8 +386,6 @@ def get_initial_vram_info():
shared.args.gpu_layers,
shared.args.ctx_size,
shared.args.cache_type,
- auto_adjust=False,
- for_ui=True
)
return "Estimated VRAM to load the model:
"
@@ -396,7 +394,7 @@ def get_initial_vram_info():
def get_initial_gpu_layers_max():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
model_settings = get_model_metadata(shared.model_name)
- return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256))
+ return model_settings.get('max_gpu_layers', 256)
return 256
diff --git a/server.py b/server.py
index 25776f2d..304a0478 100644
--- a/server.py
+++ b/server.py
@@ -62,7 +62,6 @@ from modules.models import load_model, unload_model_if_idle
from modules.models_settings import (
get_fallback_settings,
get_model_metadata,
- update_gpu_layers_and_vram,
update_model_parameters
)
from modules.shared import do_cmd_flags_warnings
@@ -315,20 +314,6 @@ if __name__ == "__main__":
model_settings = get_model_metadata(shared.model_name)
update_model_parameters(model_settings, initial=True) # hijack the command-line arguments
- # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
- if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
- vram_usage, adjusted_layers = update_gpu_layers_and_vram(
- shared.args.loader,
- shared.model_name,
- model_settings['gpu_layers'],
- shared.args.ctx_size,
- shared.args.cache_type,
- auto_adjust=True,
- for_ui=False
- )
-
- shared.args.gpu_layers = adjusted_layers
-
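+ # With gpu_layers left at 0, llama.cpp fits the GPU offload itself via --fit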
# Load the model
shared.model, shared.tokenizer = load_model(shared.model_name)
if shared.args.lora: