import sys
import time

import modules.shared as shared
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
from modules.utils import resolve_model_path

# Refreshed on each generation by the generation code elsewhere in the app;
# polled by unload_model_if_idle() below to enforce the idle timeout.
last_generation_time = time.time()

def load_model(model_name, loader=None):
    logger.info(f"Loading \"{model_name}\"")
    t0 = time.time()

    shared.is_seq2seq = False
    shared.model_name = model_name
    load_func_map = {
        'llama.cpp': llama_cpp_server_loader,
        'Transformers': transformers_loader,
        'ExLlamav3_HF': ExLlamav3_HF_loader,
        'ExLlamav3': ExLlamav3_loader,
        'ExLlamav2_HF': ExLlamav2_HF_loader,
        'ExLlamav2': ExLlamav2_loader,
        'TensorRT-LLM': TensorRT_LLM_loader,
        'ktransformers': ktransformers_loader,
    }
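    # Each loader returns either a bare model or a (model, tokenizer) tuple;
    # bare models fall back to load_tokenizer() further below.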

    metadata = get_model_metadata(model_name)
    if loader is None:
        if shared.args.loader is not None:
            loader = shared.args.loader
        else:
            loader = metadata['loader']
            if loader is None:
                logger.error('The path to the model does not exist. Exiting.')
                raise ValueError

    if loader != 'llama.cpp' and 'sampler_hijack' not in sys.modules:
        from modules import sampler_hijack
        sampler_hijack.hijack_samplers()

    shared.args.loader = loader
    output = load_func_map[loader](model_name)
    if type(output) is tuple:
        model, tokenizer = output
    else:
        model = output
        if model is not None:
            from modules.transformers_loader import load_tokenizer
            tokenizer = load_tokenizer(model_name)

    if model is None:
        return None, None

    shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
        shared.settings['truncation_length'] = shared.args.ctx_size

    shared.is_multimodal = False
    if loader.lower() in ('exllamav3', 'llama.cpp') and hasattr(model, 'is_multimodal'):
        shared.is_multimodal = model.is_multimodal()

    logger.info(f"Loaded \"{model_name}\" in {(time.time() - t0):.2f} seconds.")
    logger.info(f"LOADER: \"{loader}\"")
    logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
    logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
    return model, tokenizer
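
# Typical call sites (cf. reload_model() below); the second form forces a
# specific backend, and 'MyModel-GGUF' is a placeholder, not a real model:
#
#     shared.model, shared.tokenizer = load_model(shared.model_name)
#     shared.model, shared.tokenizer = load_model('MyModel-GGUF', loader='llama.cpp')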


def llama_cpp_server_loader(model_name):
    from modules.llama_cpp_server import LlamaServer

    path = resolve_model_path(model_name)
    if path.is_file():
        model_file = path
    else:
        gguf_files = sorted(path.glob('*.gguf'))
        if not gguf_files:
            logger.error(f"No .gguf models found in the directory: {path}")
            return None, None

        model_file = gguf_files[0]

    try:
        model = LlamaServer(model_file)
        # The server object handles tokenization itself, so it doubles as
        # the tokenizer.
        return model, model
    except Exception as e:
        logger.error(f"Error loading the model with llama.cpp: {str(e)}")
        return None, None


def transformers_loader(model_name):
    from modules.transformers_loader import load_model_HF
    return load_model_HF(model_name)


def ExLlamav3_HF_loader(model_name):
    from modules.exllamav3_hf import Exllamav3HF
    return Exllamav3HF.from_pretrained(model_name)


def ExLlamav3_loader(model_name):
    from modules.exllamav3 import Exllamav3Model
    model, tokenizer = Exllamav3Model.from_pretrained(model_name)
    return model, tokenizer


def ExLlamav2_HF_loader(model_name):
    from modules.exllamav2_hf import Exllamav2HF
    return Exllamav2HF.from_pretrained(model_name)


def ExLlamav2_loader(model_name):
    from modules.exllamav2 import Exllamav2Model
    model, tokenizer = Exllamav2Model.from_pretrained(model_name)
    return model, tokenizer


def TensorRT_LLM_loader(model_name):
    try:
        from modules.tensorrt_llm import TensorRTLLMModel
    except ModuleNotFoundError:
        raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. Please install it manually following the instructions in the TensorRT-LLM GitHub repository.")

    model = TensorRTLLMModel.from_pretrained(model_name)
    return model


def ktransformers_loader(model_name):
    try:
        import ktransformers  # importing activates the patches/accelerators
    except ModuleNotFoundError:
        logger.error("KTransformers is not installed: pip install ktransformers")
        raise

    from modules.transformers_loader import load_model_HF
    return load_model_HF(model_name)
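
# Note: the bare `import ktransformers` above is assumed to patch transformers
# in place at import time, so the standard load_model_HF() path picks up the
# acceleration without further changes here.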


def unload_model(keep_model_name=False):
    if shared.model is None:
        return

    model_class_name = shared.model.__class__.__name__
    is_llamacpp = (model_class_name == 'LlamaServer')
    if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
        shared.model.unload()
    elif model_class_name in ['Exllamav2Model', 'Exllamav2HF'] and hasattr(shared.model, 'unload'):
        shared.model.unload()

    shared.model = shared.tokenizer = None
    shared.lora_names = []
    shared.model_dirty_from_training = False

    # The llama.cpp backend runs out-of-process, so there is no in-process
    # torch cache to clear for it.
    if not is_llamacpp:
        from modules.torch_utils import clear_torch_cache
        clear_torch_cache()

    if not keep_model_name:
        shared.model_name = 'None'


def reload_model():
    unload_model()
    shared.model, shared.tokenizer = load_model(shared.model_name)


def unload_model_if_idle():
    global last_generation_time

    logger.info(f"Setting a timeout of {shared.args.idle_timeout} minutes to unload the model in case of inactivity.")

    while True:
        shared.generation_lock.acquire()
        try:
            if time.time() - last_generation_time > shared.args.idle_timeout * 60:
                if shared.model is not None:
                    logger.info("Unloading the model due to inactivity.")
                    unload_model(keep_model_name=True)
        finally:
            shared.generation_lock.release()

        time.sleep(60)
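
# unload_model_if_idle() blocks forever, so it is meant to run on a background
# thread. A minimal sketch (the actual thread setup lives outside this module
# and is assumed here):
#
#     import threading
#     threading.Thread(target=unload_model_if_idle, daemon=True).start()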