Mirror of https://github.com/oobabooga/text-generation-webui.git
Make --model work with absolute paths, e.g. --model /tmp/gemma-3-270m-it-IQ4_NL.gguf
This commit is contained in:
parent fd41f2fafc
commit f247c2ae62
modules/models.py

@@ -5,6 +5,7 @@ from pathlib import Path
 import modules.shared as shared
 from modules.logging_colors import logger
 from modules.models_settings import get_model_metadata
+from modules.utils import resolve_model_path
 
 last_generation_time = time.time()
 

@@ -69,17 +70,24 @@ def load_model(model_name, loader=None):
 def llama_cpp_server_loader(model_name):
     from modules.llama_cpp_server import LlamaServer
 
-    path = Path(f'{shared.args.model_dir}/{model_name}')
+    path = resolve_model_path(model_name)
 
     if path.is_file():
         model_file = path
     else:
-        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
+        gguf_files = sorted(path.glob('*.gguf'))
+        if not gguf_files:
+            logger.error(f"No .gguf models found in the directory: {path}")
+            return None, None
+
+        model_file = gguf_files[0]
 
     try:
         model = LlamaServer(model_file)
         return model, model
     except Exception as e:
         logger.error(f"Error loading the model with llama.cpp: {str(e)}")
         return None, None
 
 
 def transformers_loader(model_name):
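With resolve_model_path in place, the loader no longer assumes the argument lives under shared.args.model_dir: a resolved file is used directly, a resolved directory is scanned for .gguf files, and an empty directory now produces a logged error and (None, None) instead of an IndexError from the old sorted(...)[0]. A minimal standalone sketch of that branch, using the path from the commit title for illustration:

from pathlib import Path

def pick_gguf(path: Path):
    # Simplified mirror of the new branch in llama_cpp_server_loader:
    # a file is taken as-is; a directory must contain at least one *.gguf.
    if path.is_file():
        return path
    gguf_files = sorted(path.glob('*.gguf'))
    if not gguf_files:
        return None  # the real loader logs an error and returns (None, None)
    return gguf_files[0]

print(pick_gguf(Path('/tmp/gemma-3-270m-it-IQ4_NL.gguf')))  # path from the commit title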
modules/models_settings.py

@@ -10,6 +10,7 @@ import yaml
 
 from modules import chat, loaders, metadata_gguf, shared, ui
 from modules.logging_colors import logger
+from modules.utils import resolve_model_path
 
 
 def get_fallback_settings():

@@ -26,6 +27,7 @@ def get_fallback_settings():
 
 
 def get_model_metadata(model):
+    model_path = resolve_model_path(model)
     model_settings = {}
 
     # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml

@@ -35,7 +37,7 @@ def get_model_metadata(model):
            for k in settings[pat]:
                model_settings[k] = settings[pat][k]
 
-    path = Path(f'{shared.args.model_dir}/{model}/config.json')
+    path = model_path / 'config.json'
     if path.exists():
         hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read())
     else:

@@ -51,7 +53,7 @@ def get_model_metadata(model):
 
     # GGUF metadata
     if model_settings['loader'] == 'llama.cpp':
-        path = Path(f'{shared.args.model_dir}/{model}')
+        path = model_path
         if path.is_file():
             model_file = path
         else:

@@ -130,18 +132,18 @@ def get_model_metadata(model):
            model_settings['bf16'] = True
 
    # Try to find the Jinja instruct template
-    path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
+    path = model_path / 'tokenizer_config.json'
    template = None
 
    # 1. Prioritize reading from chat_template.jinja if it exists
-    jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja'
+    jinja_path = model_path / 'chat_template.jinja'
    if jinja_path.exists():
        with open(jinja_path, 'r', encoding='utf-8') as f:
            template = f.read()
 
    # 2. If no .jinja file, try chat_template.json
    if template is None:
-        json_template_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.json'
+        json_template_path = model_path / 'chat_template.json'
        if json_template_path.exists():
            with open(json_template_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

@@ -201,7 +203,7 @@ def get_model_metadata(model):
 
 
 def infer_loader(model_name, model_settings, hf_quant_method=None):
-    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+    path_to_model = resolve_model_path(model_name)
     if not path_to_model.exists():
         loader = None
     elif shared.args.portable:

@@ -357,7 +359,7 @@ def get_model_size_mb(model_file: Path) -> float:
 
 
 def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
-    model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
+    model_file = resolve_model_path(gguf_file)
     metadata = load_gguf_metadata_with_cache(model_file)
     size_in_mb = get_model_size_mb(model_file)
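These models_settings.py hunks all follow the same pattern: resolve the model location once at the top of get_model_metadata and join metadata filenames onto that single Path, instead of rebuilding f'{shared.args.model_dir}/{model}' strings at every lookup. Because pathlib joins work the same way for a folder under user_data/models and for an arbitrary absolute path, no other logic has to change. A small illustration (the directory name is hypothetical):

from pathlib import Path

model_path = Path('/tmp/some-model-folder')  # hypothetical resolved model path
for name in ('config.json', 'tokenizer_config.json', 'chat_template.jinja'):
    candidate = model_path / name            # same joins used in the diff
    print(candidate, candidate.exists())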
modules/utils.py

@@ -86,6 +86,19 @@ def check_model_loaded():
     return True, None
 
 
+def resolve_model_path(model_name_or_path):
+    """
+    Resolves a model path, checking for a direct path
+    before the default models directory.
+    """
+
+    path_candidate = Path(model_name_or_path)
+    if path_candidate.exists():
+        return path_candidate
+    else:
+        return Path(f'{shared.args.model_dir}/{model_name_or_path}')
+
+
 def get_available_models():
     # Get all GGUF files
     gguf_files = get_available_ggufs()
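The helper simply prefers an existing filesystem path and otherwise falls back to the configured models directory. A standalone sketch of the same logic, with shared.args.model_dir replaced by a constant and a hypothetical model name, showing how the two --model forms resolve:

from pathlib import Path

MODEL_DIR = 'user_data/models'  # stand-in for shared.args.model_dir

def resolve_model_path(model_name_or_path):
    # Same logic as the new helper in modules/utils.py, without the shared import.
    path_candidate = Path(model_name_or_path)
    if path_candidate.exists():
        return path_candidate
    return Path(f'{MODEL_DIR}/{model_name_or_path}')

# An existing absolute path is returned unchanged; a bare name falls back to MODEL_DIR.
print(resolve_model_path('/tmp/gemma-3-270m-it-IQ4_NL.gguf'))  # /tmp/... if that file exists
print(resolve_model_path('some-local-model'))                  # user_data/models/some-local-model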
server.py (13 lines changed)

@@ -283,21 +283,14 @@ if __name__ == "__main__":
 
     # If any model has been selected, load it
     if shared.model_name != 'None':
-        p = Path(shared.model_name)
-        if p.exists():
-            model_name = p.parts[-1]
-            shared.model_name = model_name
-        else:
-            model_name = shared.model_name
-
-        model_settings = get_model_metadata(model_name)
+        model_settings = get_model_metadata(shared.model_name)
         update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments
 
         # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
         if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
             vram_usage, adjusted_layers = update_gpu_layers_and_vram(
                 shared.args.loader,
-                model_name,
+                shared.model_name,
                 model_settings['gpu_layers'],
                 shared.args.ctx_size,
                 shared.args.cache_type,

@@ -308,7 +301,7 @@ if __name__ == "__main__":
             shared.args.gpu_layers = adjusted_layers
 
         # Load the model
-        shared.model, shared.tokenizer = load_model(model_name)
+        shared.model, shared.tokenizer = load_model(shared.model_name)
         if shared.args.lora:
             add_lora_to_model(shared.args.lora)
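The server.py hunk drops the earlier workaround that reduced a path-style --model value to its basename before looking up metadata; with resolve_model_path handling both forms downstream, shared.model_name can now be passed through unchanged. A simplified illustration of why the old basename approach broke for files outside the models directory (the path is the one from the commit title):

from pathlib import Path

value = '/tmp/gemma-3-270m-it-IQ4_NL.gguf'
basename = Path(value).parts[-1]                  # old code kept only 'gemma-3-270m-it-IQ4_NL.gguf'
old_lookup = Path('user_data/models') / basename  # ...so later lookups landed here
print(old_lookup)                                 # not where the file actually lives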