text-generation-webui/modules/models_settings.py

import functools
import json
import re
from math import floor
from pathlib import Path

import gradio as gr
import yaml

from modules import chat, loaders, metadata_gguf, shared, ui
from modules.logging_colors import logger
from modules.utils import resolve_model_path


def get_fallback_settings():
    """
    Return the default model settings used as a fallback.

    The fixed defaults are hard-coded here; the truncation length (also
    reported as 'truncation_length_info') and 'skip_special_tokens' are read
    from shared.settings at call time.
    """
    fallback = dict(
        bf16=False,
        ctx_size=8192,
        rope_freq_base=0,
        compress_pos_emb=1,
        alpha_value=1,
    )

    # Both truncation entries mirror the same global setting
    truncation_length = shared.settings['truncation_length']
    fallback['truncation_length'] = truncation_length
    fallback['truncation_length_info'] = truncation_length
    fallback['skip_special_tokens'] = shared.settings['skip_special_tokens']
    return fallback


def _read_json_file(path):
    """Parse the UTF-8 JSON document at `path`, closing the file handle afterwards."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _strip_raise_exception(template):
    """Remove `raise_exception(...)` calls from a Jinja chat template string."""
    # Whole `{{ raise_exception(...) }}` output statements are dropped entirely
    template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
    # Any remaining inline call is replaced by an empty string literal
    return re.sub(r'raise_exception\([^)]*\)', "''", template)


def get_model_metadata(model):
    """
    Collect the settings for `model` from every available source.

    Sources are applied in this order, later ones overriding earlier ones:
      1. Entries of shared.model_config whose regex key matches the model's
         name (both sides lowercased).
      2. The loader returned by infer_loader(), unless one was configured.
      3. GGUF metadata (llama.cpp loader) or the model's config.json
         (any other loader).
      4. A chat template read from chat_template.jinja, chat_template.json or
         tokenizer_config.json, in that order of priority.
      5. Entries of shared.user_config whose regex key matches the model's name.

    Side effects:
        shared.bos_token and shared.eos_token are overwritten whenever a chat
        template is found (from GGUF metadata or from the files in step 4).

    Args:
        model: name of the model, resolved on disk with resolve_model_path().

    Returns:
        dict: the merged settings. 'loader' and 'instruction_template' are
        always present.

    Raises:
        FileNotFoundError: the loader is llama.cpp and the resolved path is a
            directory that contains no .gguf file.
    """
    model_path = resolve_model_path(model)
    model_settings = {}
    model_name = Path(model).name.lower()

    # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
    settings = shared.model_config
    for pat in settings:
        if re.match(pat.lower(), model_name):
            model_settings.update(settings[pat])

    # Parse config.json once; it is reused for loader inference and Transformers metadata
    config_path = model_path / 'config.json'
    hf_metadata = _read_json_file(config_path) if config_path.exists() else None

    if 'loader' not in model_settings:
        quant_method = None if hf_metadata is None else hf_metadata.get("quantization_config", {}).get("quant_method", None)
        model_settings['loader'] = infer_loader(
            model,
            model_settings,
            hf_quant_method=quant_method
        )

    # GGUF metadata
    if model_settings['loader'] == 'llama.cpp':
        if model_path.is_file():
            model_file = model_path
        else:
            gguf_files = list(model_path.glob('*.gguf'))
            if not gguf_files:
                error_msg = f"No .gguf models found in directory: {model_path}"
                logger.error(error_msg)
                raise FileNotFoundError(error_msg)

            model_file = gguf_files[0]

        metadata = load_gguf_metadata_with_cache(model_file)

        for k in metadata:
            if k.endswith('.context_length'):
                # Default context is capped at 8192; the real maximum is kept for display
                model_settings['ctx_size'] = min(metadata[k], 8192)
                model_settings['truncation_length_info'] = metadata[k]
            elif k.endswith('rope.freq_base'):
                model_settings['rope_freq_base'] = metadata[k]
            elif k.endswith('rope.scale_linear'):
                model_settings['compress_pos_emb'] = metadata[k]
            elif k.endswith('rope.scaling.factor'):
                model_settings['compress_pos_emb'] = metadata[k]
            elif k.endswith('.block_count'):
                model_settings['gpu_layers'] = -1
                model_settings['max_gpu_layers'] = metadata[k] + 1

        if 'tokenizer.chat_template' in metadata:
            template = metadata['tokenizer.chat_template']
            if 'tokenizer.ggml.eos_token_id' in metadata:
                eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
            else:
                eos_token = ""

            if 'tokenizer.ggml.bos_token_id' in metadata:
                bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
            else:
                bos_token = ""

            shared.bos_token = bos_token
            shared.eos_token = eos_token

            model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
            model_settings['instruction_template_str'] = _strip_raise_exception(template)

    elif hf_metadata is not None:
        # Transformers metadata
        metadata = hf_metadata
        if 'pretrained_config' in metadata:
            metadata = metadata['pretrained_config']

        # The first key found wins; it may live at the top level or under 'text_config'
        for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
            if k in metadata:
                value = metadata[k]
            elif k in metadata.get('text_config', {}):
                value = metadata['text_config'][k]
            else:
                continue

            model_settings['truncation_length'] = value
            model_settings['truncation_length_info'] = value
            model_settings['ctx_size'] = min(value, 8192)
            break

        if 'rope_theta' in metadata:
            model_settings['rope_freq_base'] = metadata['rope_theta']
        elif 'attn_config' in metadata and 'rope_theta' in metadata['attn_config']:
            model_settings['rope_freq_base'] = metadata['attn_config']['rope_theta']

        if 'rope_scaling' in metadata and isinstance(metadata['rope_scaling'], dict) and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
            if metadata['rope_scaling']['type'] == 'linear':
                model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor']

        if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16':
            model_settings['bf16'] = True

    # Try to find the Jinja instruct template
    template = None

    # 1. Prioritize reading from chat_template.jinja if it exists
    jinja_path = model_path / 'chat_template.jinja'
    if jinja_path.exists():
        with open(jinja_path, 'r', encoding='utf-8') as f:
            template = f.read()

    # 2. If no .jinja file, try chat_template.json
    if template is None:
        json_template_path = model_path / 'chat_template.json'
        if json_template_path.exists():
            json_data = _read_json_file(json_template_path)
            if 'chat_template' in json_data:
                template = json_data['chat_template']

    # 3. Fall back to tokenizer_config.json metadata
    tokenizer_config_path = model_path / 'tokenizer_config.json'
    tokenizer_metadata = {}
    if tokenizer_config_path.exists():
        tokenizer_metadata = _read_json_file(tokenizer_config_path)

        # Only read from metadata if we haven't already loaded from .jinja or .json
        if template is None and 'chat_template' in tokenizer_metadata:
            template = tokenizer_metadata['chat_template']
            if isinstance(template, list):
                template = template[0]['template']

    # 4. If a template was found from any source, process it. This must not
    #    depend on tokenizer_config.json existing, otherwise a template read
    #    from chat_template.jinja / chat_template.json would be discarded.
    if template:
        shared.bos_token = '<s>'
        shared.eos_token = '</s>'

        # tokenizer_config.json (when present) overrides the default tokens
        for k in ['eos_token', 'bos_token']:
            if k in tokenizer_metadata:
                value = tokenizer_metadata[k]
                if isinstance(value, dict):
                    value = value['content']

                setattr(shared, k, value)

        model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
        model_settings['instruction_template_str'] = _strip_raise_exception(template)

    if 'instruction_template' not in model_settings:
        model_settings['instruction_template'] = 'Alpaca'

    # Ignore rope_freq_base if set to the default value
    if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000:
        model_settings.pop('rope_freq_base')

    # Apply user settings from user_data/models/config-user.yaml
    settings = shared.user_config
    for pat in settings:
        if re.match(pat.lower(), model_name):
            for k in settings[pat]:
                # 'n_gpu_layers' is stored under the key 'gpu_layers'
                new_k = 'gpu_layers' if k == 'n_gpu_layers' else k
                model_settings[new_k] = settings[pat][k]

    # Load instruction template if defined by name rather than by value
    if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
        model_settings['instruction_template_str'] = chat.load_instruction_template(model_settings['instruction_template'])

    return model_settings


def infer_loader(model_name, model_settings, hf_quant_method=None):
    """
    Pick the loader name for `model_name`.

    The checks run in order and the first match wins:
      - resolved path does not exist            -> None
      - shared.args.portable is set             -> 'llama.cpp'
      - the path contains a *.gguf file         -> 'llama.cpp'
      - the name contains '.gguf'               -> 'llama.cpp'
      - hf_quant_method is 'exl3'               -> 'ExLlamav3'
      - hf_quant_method is 'exl2' or 'gptq'     -> 'ExLlamav2_HF'
      - the name contains 'exl3'                -> 'ExLlamav3'
      - the name contains 'exl2'                -> 'ExLlamav2_HF'
      - otherwise                               -> 'Transformers'

    `model_settings` is accepted but not used by this function.
    """
    path_to_model = resolve_model_path(model_name)
    if not path_to_model.exists():
        return None

    if shared.args.portable:
        return 'llama.cpp'

    if any(path_to_model.glob('*.gguf')):
        return 'llama.cpp'

    lowered_name = model_name.lower()
    if re.match(r'.*\.gguf', lowered_name):
        return 'llama.cpp'

    # The quantization method declared by the model takes precedence over name hints
    if hf_quant_method == 'exl3':
        return 'ExLlamav3'

    if hf_quant_method in ['exl2', 'gptq']:
        return 'ExLlamav2_HF'

    if re.match(r'.*exl3', lowered_name):
        return 'ExLlamav3'

    if re.match(r'.*exl2', lowered_name):
        return 'ExLlamav2_HF'

    return 'Transformers'


def update_model_parameters(state, initial=False):
    '''
    UI: copy the interface values found in `state` onto shared.args.

    Only the names returned by ui.list_model_elements() are considered.
    When `initial` is true, names listed in shared.provided_arguments are
    left untouched. A 'cpu_memory' value of 0 is replaced by the value
    stored in shared.args_defaults.
    '''
    for name in ui.list_model_elements():
        if name in state:
            new_value = state[name]

            keep_current = initial and name in shared.provided_arguments
            if not keep_current:
                if name == 'cpu_memory' and new_value == 0:
                    new_value = vars(shared.args_defaults)[name]

                setattr(shared.args, name, new_value)


def apply_model_settings_to_state(model, state):
    '''
    UI: merge the settings returned by get_model_metadata(model) into `state`.

    `state` is modified in place and also returned. For the llama.cpp loader
    the 'gpu_layers' entry becomes a gr.update() carrying the value and the
    slider maximum, and 'vram_info' is refreshed.
    '''
    model_settings = get_model_metadata(model)

    if 'loader' in model_settings:
        detected = model_settings.pop('loader')

        # Keep the current selection when it is the non-HF variant of the detected loader
        keeps_exllamav2 = detected == 'ExLlamav2_HF' and state['loader'] == 'ExLlamav2'
        keeps_exllamav3 = detected == 'ExLlamav3_HF' and state['loader'] == 'ExLlamav3'
        if not keeps_exllamav2 and not keeps_exllamav3:
            state['loader'] = detected

    for key, value in model_settings.items():
        # gpu_layers is handled separately below
        if key == 'gpu_layers' or key not in state:
            continue

        state[key] = value

    # Nothing more to do unless llama.cpp is selected and a layer count is known
    if state['loader'] != 'llama.cpp' or 'gpu_layers' not in model_settings:
        return state

    layer_count = model_settings['gpu_layers']  # -1 (auto) by default, or user-saved value
    layer_limit = model_settings.get('max_gpu_layers', 256)
    state['gpu_layers'] = gr.update(value=layer_count, maximum=layer_limit)

    state['vram_info'] = update_gpu_layers_and_vram(
        state['loader'],
        model,
        layer_count,
        state['ctx_size'],
        state['cache_type'],
    )

    return state


def save_model_settings(model, state):
    '''
    Save the settings for this model to config-user.yaml inside shared.args.model_dir.

    This is a generator that yields a single status message. Only 'loader'
    and the parameters listed for the selected loader in
    loaders.loaders_and_params are stored, under the key
    "<model name>$". shared.user_config is replaced by the updated config.
    '''
    if model == 'None':
        yield "Not saving the settings because no model is selected in the menu."
        return

    user_config = shared.load_user_config()
    model_regex = Path(model).name + '$'  # For exact matches
    model_entry = user_config.setdefault(model_regex, {})

    for element in ui.list_model_elements():
        # Skip parameters that do not belong to the selected loader
        if element != 'loader' and element not in loaders.loaders_and_params[state['loader']]:
            continue

        model_entry[element] = state[element]

    shared.user_config = user_config

    serialized = yaml.dump(user_config, sort_keys=False)
    p = Path(f'{shared.args.model_dir}/config-user.yaml')
    with open(p, 'w') as config_file:
        config_file.write(serialized)

    yield f"Settings for `{model}` saved to `{p}`."


def save_instruction_template(model, template):
    '''
    Save only the instruction template of this model to config-user.yaml
    inside shared.args.model_dir.

    This is a generator that yields a single status message. A `template`
    equal to the string 'None' removes the stored template instead of
    saving one. shared.user_config is replaced by the updated config.
    '''
    if model == 'None':
        yield "Not saving the template because no model is selected in the menu."
        return

    user_config = shared.load_user_config()
    model_regex = Path(model).name + '$'  # For exact matches
    model_entry = user_config.setdefault(model_regex, {})

    unset_template = template == 'None'
    if unset_template:
        model_entry.pop('instruction_template', None)
    else:
        model_entry['instruction_template'] = template

    shared.user_config = user_config

    serialized = yaml.dump(user_config, sort_keys=False)
    p = Path(f'{shared.args.model_dir}/config-user.yaml')
    with open(p, 'w') as config_file:
        config_file.write(serialized)

    if unset_template:
        message = f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`."
    else:
        message = f"Instruction template for `{model}` saved to `{p}` as `{template}`."

    yield message


@functools.lru_cache(maxsize=1)
def load_gguf_metadata_with_cache(model_file):
    """
    Return the metadata of `model_file` as loaded by metadata_gguf.load_metadata.

    The result is memoized with lru_cache(maxsize=1): only the most recently
    requested file is kept, so repeated calls with the same argument do not
    reload it. `model_file` must be hashable because it is the cache key.
    """
    return metadata_gguf.load_metadata(model_file)


def get_model_size_mb(model_file: Path) -> float:
    """
    Return the on-disk size of a GGUF model in MiB (bytes / 1024**2).

    When the file name follows the multipart pattern
    "<base>-<part>-of-<count>.gguf", the sizes of all sibling files with the
    same base name and the same part count are summed. Siblings are matched
    with an escaped regular expression rather than a glob, so that
      - unrelated splits sharing a prefix (e.g. "model-v2-00001-of-00002.gguf"
        next to "model-00001-of-00002.gguf") are not counted, and
      - base names containing glob metacharacters such as "[" still match.

    Args:
        model_file: path to the model file (any part of a multipart model).

    Returns:
        The total size in MiB. A multipart name with no matching files on
        disk yields 0.0.

    Raises:
        OSError: the file is a single-part model and cannot be stat'ed.
    """
    # Check for multipart pattern
    match = re.match(r'(.+)-\d+-of-(\d+)\.gguf$', model_file.name)

    if not match:
        # Single part
        total_size = model_file.stat().st_size
    else:
        # It's a multipart file, find all parts of this exact split
        base_name, part_count = match.groups()
        part_regex = re.compile(
            re.escape(base_name) + r'-\d+-of-' + re.escape(part_count) + r'\.gguf'
        )

        parent = model_file.parent
        siblings = parent.iterdir() if parent.is_dir() else ()
        total_size = sum(
            p.stat().st_size
            for p in siblings
            if p.is_file() and part_regex.fullmatch(p.name)
        )

    return total_size / (1024 ** 2)  # Return size in MB


def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
    """
    Estimate the VRAM needed to load a GGUF model with the given settings.

    The layer count, KV head count (falling back to the attention head count
    when absent) and embedding length are read from the GGUF metadata, and
    the file size from disk. `gpu_layers` is clamped to the model's layer
    count. `cache_type` 'q4_0' counts as 4, 'q8_0' as 8 and anything else
    as 16.

    Formula details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
    """
    model_file = resolve_model_path(gguf_file)
    metadata = load_gguf_metadata_with_cache(model_file)
    size_in_mb = get_model_size_mb(model_file)

    def _largest(entry):
        # Head counts may be stored as a list; use its largest entry
        return max(entry) if isinstance(entry, list) else entry

    # Extract values from metadata
    n_layers = n_kv_heads = n_attention_heads = embedding_dim = None
    for key, value in metadata.items():
        if key.endswith('.block_count'):
            n_layers = value
        elif key.endswith('.attention.head_count_kv'):
            n_kv_heads = _largest(value)
        elif key.endswith('.attention.head_count'):
            n_attention_heads = _largest(value)
        elif key.endswith('.embedding_length'):
            embedding_dim = value

    # Fallback for models without separate KV heads
    kv_heads = n_attention_heads if n_kv_heads is None else n_kv_heads

    # Never offload more layers than the model has
    gpu_layers = min(gpu_layers, n_layers)

    # Convert cache_type to numeric
    cache_bits = 4 if cache_type == 'q4_0' else 8 if cache_type == 'q8_0' else 16

    # Derived features
    size_per_layer = size_in_mb / max(n_layers, 1e-6)
    kv_cache_factor = kv_heads * cache_bits * ctx_size
    embedding_per_context = embedding_dim / ctx_size

    # Calculate VRAM using the fitted model, split into its two factors
    per_layer_cost = size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor
    effective_layers = gpu_layers + max(0.9690636483914102, cache_bits - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632))

    return per_layer_cost * effective_layers + 1516.522943869404


def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type):
    """
    Build the HTML snippet showing the estimated VRAM usage in the UI.

    An empty estimate (label only) is returned when no estimate applies:
    the loader is not llama.cpp, no model is selected, the model name does
    not end in ".gguf", `gpu_layers` is negative, or `ctx_size` is 0.
    Otherwise the value from estimate_vram() is shown rounded to whole MiB.
    """
    # NOTE(review): the markup has a stray apostrophe after the id attribute;
    # it is kept as-is here because the string is emitted at runtime.
    empty_estimate = "<div id=\"vram-info\"'>Estimated VRAM to load the model:</div>"

    if loader != 'llama.cpp':
        return empty_estimate

    if model in ("None", None):
        return empty_estimate

    if not model.endswith(".gguf"):
        return empty_estimate

    if gpu_layers < 0 or ctx_size == 0:
        return empty_estimate

    vram_usage = estimate_vram(model, gpu_layers, ctx_size, cache_type)
    return f"<div id=\"vram-info\"'>Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span></div>"