import sys
import time
from pathlib import Path

import modules.shared as shared
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata

last_generation_time = time.time()


def load_model(model_name, loader=None):
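    """Load `model_name` and return a (model, tokenizer) pair.

    If `loader` is None, it is resolved from `shared.args.loader` or, failing
    that, from the model's metadata. Returns (None, None) if loading fails.
    """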
    logger.info(f"Loading \"{model_name}\"")
    t0 = time.time()

    shared.is_seq2seq = False
    shared.model_name = model_name
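    # Map each supported loader name to the function that implements it.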
    load_func_map = {
        'llama.cpp': llama_cpp_server_loader,
        'Transformers': transformers_loader,
        'ExLlamav3_HF': ExLlamav3_HF_loader,
        'ExLlamav2_HF': ExLlamav2_HF_loader,
        'ExLlamav2': ExLlamav2_loader,
        'TensorRT-LLM': TensorRT_LLM_loader,
    }

    metadata = get_model_metadata(model_name)
    if loader is None:
        if shared.args.loader is not None:
            loader = shared.args.loader
        else:
            loader = metadata['loader']
            if loader is None:
                logger.error('The path to the model does not exist. Exiting.')
                raise ValueError

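    # Apply the sampler hijack once per process (the llama.cpp loader does not use it).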
    if loader != 'llama.cpp' and 'sampler_hijack' not in sys.modules:
        from modules import sampler_hijack
        sampler_hijack.hijack_samplers()

    shared.args.loader = loader
    output = load_func_map[loader](model_name)
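    # A loader may return a (model, tokenizer) tuple or just a model; in the
    # latter case, the tokenizer is loaded separately below.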
    if type(output) is tuple:
        model, tokenizer = output
    else:
        model = output
        if model is None:
            return None, None
        else:
            from modules.transformers_loader import load_tokenizer
            tokenizer = load_tokenizer(model_name)

    shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
        shared.settings['truncation_length'] = shared.args.ctx_size

    logger.info(f"Loaded \"{model_name}\" in {(time.time() - t0):.2f} seconds.")
    logger.info(f"LOADER: \"{loader}\"")
    logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
    logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
    return model, tokenizer


def llama_cpp_server_loader(model_name):
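    """Load a GGUF model through a llama.cpp server.

    `model_name` may be a single file or a directory containing *.gguf files.
    The LlamaServer object is returned twice because it serves as both model
    and tokenizer.
    """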
    from modules.llama_cpp_server import LlamaServer

    path = Path(f'{shared.args.model_dir}/{model_name}')
    if path.is_file():
        model_file = path
    else:
        # If a directory was given, pick the alphabetically first *.gguf file inside it.
        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]

    try:
        model = LlamaServer(model_file)
        return model, model
    except Exception as e:
        logger.error(f"Error loading the model with llama.cpp: {str(e)}")


def transformers_loader(model_name):
    from modules.transformers_loader import load_model_HF
    return load_model_HF(model_name)


def ExLlamav3_HF_loader(model_name):
    from modules.exllamav3_hf import Exllamav3HF
    return Exllamav3HF.from_pretrained(model_name)


def ExLlamav2_HF_loader(model_name):
    from modules.exllamav2_hf import Exllamav2HF
    return Exllamav2HF.from_pretrained(model_name)


def ExLlamav2_loader(model_name):
    from modules.exllamav2 import Exllamav2Model
    model, tokenizer = Exllamav2Model.from_pretrained(model_name)
    return model, tokenizer


def TensorRT_LLM_loader(model_name):
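    """Load a model with TensorRT-LLM, which must be installed separately."""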
    try:
        from modules.tensorrt_llm import TensorRTLLMModel
    except ModuleNotFoundError:
        raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. Please install it manually following the instructions in the TensorRT-LLM GitHub repository.")

    model = TensorRTLLMModel.from_pretrained(model_name)
    return model


def unload_model(keep_model_name=False):
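    """Unload the current model and free the memory it holds.

    When `keep_model_name` is True, `shared.model_name` is preserved so the
    model can be reloaded later (used by the idle-timeout unloader).
    """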
    if shared.model is None:
        return

    is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
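    # Exllamav3HF has its own unload() method; call it before dropping the reference.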
    if shared.model.__class__.__name__ == 'Exllamav3HF':
        shared.model.unload()

    shared.model = shared.tokenizer = None
    shared.lora_names = []
    shared.model_dirty_from_training = False

    if not is_llamacpp:
        from modules.torch_utils import clear_torch_cache
        clear_torch_cache()

    if not keep_model_name:
        shared.model_name = 'None'


def reload_model():
    unload_model()
    shared.model, shared.tokenizer = load_model(shared.model_name)


def unload_model_if_idle():
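    """Unload the model after `shared.args.idle_timeout` minutes of inactivity.

    Loops forever, so it is meant to run in a background thread, e.g.:

        threading.Thread(target=unload_model_if_idle, daemon=True).start()
    """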
    global last_generation_time

    logger.info(f"Setting a timeout of {shared.args.idle_timeout} minutes to unload the model in case of inactivity.")

    while True:
        shared.generation_lock.acquire()
        try:
            if time.time() - last_generation_time > shared.args.idle_timeout * 60:
                if shared.model is not None:
                    logger.info("Unloading the model due to inactivity.")
                    unload_model(keep_model_name=True)
        finally:
            shared.generation_lock.release()

        time.sleep(60)