TensorRT-LLM: Migrate from ModelRunner to LLM API, add concurrent API request support

2026-04-07 23:53:40 +00:00 · 2026-03-05 18:09:45 -08:00 · 2026-03-05 18:09:45 -08:00 · f52d9336e5
commit f52d9336e5
parent 9824c82cb6
7 changed files with 50 additions and 89 deletions
--- a/modules/models.py
+++ b/modules/models.py
@@ -114,7 +114,7 @@ def TensorRT_LLM_loader(model_name):
        raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. Please install it manually following the instructions in the TensorRT-LLM GitHub repository.")

    model = TensorRTLLMModel.from_pretrained(model_name)
-    return model
+    return model, model.tokenizer


 def unload_model(keep_model_name=False):
@@ -124,7 +124,7 @@ def unload_model(keep_model_name=False):
    model_class_name = shared.model.__class__.__name__
    is_llamacpp = (model_class_name == 'LlamaServer')

-    if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
+    if model_class_name in ['Exllamav3Model', 'Exllamav3HF', 'TensorRTLLMModel']:
        shared.model.unload()

    shared.model = shared.tokenizer = None