import sys
import time
from pathlib import Path

import modules.shared as shared
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata

last_generation_time = time.time()


def load_model(model_name, loader=None):
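    """Load `model_name` with the given loader and return a (model, tokenizer) tuple.

    If `loader` is None, fall back to `shared.args.loader`, and then to the loader
    inferred from the model's metadata.
    """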
    logger.info(f"Loading \"{model_name}\"")
    t0 = time.time()

    shared.is_seq2seq = False
    shared.model_name = model_name
    load_func_map = {
        'llama.cpp': llama_cpp_server_loader,
        'Transformers': transformers_loader,
        'ExLlamav3_HF': ExLlamav3_HF_loader,
        'ExLlamav2_HF': ExLlamav2_HF_loader,
        'ExLlamav2': ExLlamav2_loader,
        'HQQ': HQQ_loader,
        'TensorRT-LLM': TensorRT_LLM_loader,
    }
    metadata = get_model_metadata(model_name)
    if loader is None:
        if shared.args.loader is not None:
            loader = shared.args.loader
        else:
            loader = metadata['loader']
            if loader is None:
                logger.error('The path to the model does not exist. Exiting.')
                raise ValueError

    if loader != 'llama.cpp' and 'sampler_hijack' not in sys.modules:
        from modules import sampler_hijack
        sampler_hijack.hijack_samplers()

    shared.args.loader = loader
    output = load_func_map[loader](model_name)
    if type(output) is tuple:
        model, tokenizer = output
    else:
        model = output
        if model is None:
            return None, None
        else:
            from modules.transformers_loader import load_tokenizer
            tokenizer = load_tokenizer(model_name)

    shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'):
        shared.settings['truncation_length'] = shared.args.max_seq_len
    elif loader == 'llama.cpp':
        shared.settings['truncation_length'] = shared.args.n_ctx

    logger.info(f"Loaded \"{model_name}\" in {(time.time() - t0):.2f} seconds.")
    logger.info(f"LOADER: \"{loader}\"")
    logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
    logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
    return model, tokenizer


def llama_cpp_server_loader(model_name):
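    """Load a GGUF model through the llama.cpp server backend.

    The same server object is returned as both model and tokenizer.
    """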
    from modules.llama_cpp_server import LlamaServer

    path = Path(f'{shared.args.model_dir}/{model_name}')
    if path.is_file():
        model_file = path
    else:
        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]

    logger.info(f"llama.cpp weights detected: \"{model_file}\"")

    try:
        model = LlamaServer(model_file)
        return model, model
    except Exception as e:
        logger.error(f"Error loading the model with llama.cpp: {str(e)}")


def transformers_loader(model_name):
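    """Load the model with the Hugging Face Transformers backend."""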
    from modules.transformers_loader import load_model_HF
    return load_model_HF(model_name)


def ExLlamav3_HF_loader(model_name):
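    """Load the model through the ExLlamav3 HF wrapper."""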
    from modules.exllamav3_hf import Exllamav3HF
    return Exllamav3HF.from_pretrained(model_name)


def ExLlamav2_HF_loader(model_name):
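    """Load the model through the ExLlamav2 HF wrapper."""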
    from modules.exllamav2_hf import Exllamav2HF
    return Exllamav2HF.from_pretrained(model_name)


def ExLlamav2_loader(model_name):
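    """Load the model with ExLlamav2's native API, which provides its own tokenizer."""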
    from modules.exllamav2 import Exllamav2Model
    model, tokenizer = Exllamav2Model.from_pretrained(model_name)
    return model, tokenizer


def HQQ_loader(model_name):
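    """Load an HQQ-quantized model using the backend selected in shared.args.hqq_backend."""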
    try:
        from hqq.core.quantize import HQQBackend, HQQLinear
        from hqq.models.hf.base import AutoHQQHFModel
    except ModuleNotFoundError:
        raise ModuleNotFoundError("Failed to import 'hqq'. Please install it manually following the instructions in the HQQ GitHub repository.")

    logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"")

    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
    model = AutoHQQHFModel.from_quantized(str(model_dir))
    HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend))
    return model


def TensorRT_LLM_loader(model_name):
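    """Load the model with the TensorRT-LLM backend."""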
    try:
        from modules.tensorrt_llm import TensorRTLLMModel
    except ModuleNotFoundError:
        raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. Please install it manually following the instructions in the TensorRT-LLM GitHub repository.")

    model = TensorRTLLMModel.from_pretrained(model_name)
    return model


def unload_model(keep_model_name=False):
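    """Unload the current model and tokenizer and free the associated memory."""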
    is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
    shared.model = shared.tokenizer = None
    shared.lora_names = []
    shared.model_dirty_from_training = False
    if not is_llamacpp:
        from modules.torch_utils import clear_torch_cache
        clear_torch_cache()

    if not keep_model_name:
        shared.model_name = 'None'


def reload_model():
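    """Unload the current model and load it again."""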
    unload_model()
    shared.model, shared.tokenizer = load_model(shared.model_name)


def unload_model_if_idle():
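    """Background loop that unloads the model after shared.args.idle_timeout minutes without generation."""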
    global last_generation_time

    logger.info(f"Setting a timeout of {shared.args.idle_timeout} minutes to unload the model in case of inactivity.")

    while True:
        shared.generation_lock.acquire()
        try:
            if time.time() - last_generation_time > shared.args.idle_timeout * 60:
                if shared.model is not None:
                    logger.info("Unloading the model for inactivity.")
                    unload_model(keep_model_name=True)
        finally:
            shared.generation_lock.release()

        time.sleep(60)