From f52d9336e53cd3149dc76e95c02925498915eabd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Mar 2026 18:09:45 -0800
Subject: [PATCH] TensorRT-LLM: Migrate from ModelRunner to LLM API, add
 concurrent API request support

---
 modules/loaders.py         |   6 +-
 modules/models.py          |   4 +-
 modules/shared.py          |   4 --
 modules/tensorrt_llm.py    | 115 ++++++++++++-------------------------
 modules/text_generation.py |   2 +-
 modules/ui.py              |   1 -
 modules/ui_model_menu.py   |   7 ++-
 7 files changed, 50 insertions(+), 89 deletions(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index 0348c939..15b8dfeb 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -77,7 +77,6 @@ loaders_and_params = OrderedDict({
     ],
     'TensorRT-LLM': [
         'ctx_size',
-        'cpp_runner',
         'tensorrt_llm_info',
     ]
 })
@@ -252,11 +251,16 @@ loaders_samplers = {
         'temperature',
         'top_p',
         'top_k',
+        'min_p',
         'repetition_penalty',
         'frequency_penalty',
         'presence_penalty',
+        'no_repeat_ngram_size',
         'auto_max_new_tokens',
         'ban_eos_token',
+        'add_bos_token',
+        'skip_special_tokens',
+        'seed',
     }
 }
 
diff --git a/modules/models.py b/modules/models.py
index 9780b9f5..bc5585cf 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -114,7 +114,7 @@ def TensorRT_LLM_loader(model_name):
         raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. Please install it manually following the instructions in the TensorRT-LLM GitHub repository.")
 
     model = TensorRTLLMModel.from_pretrained(model_name)
-    return model
+    return model, model.tokenizer
 
 
 def unload_model(keep_model_name=False):
@@ -124,7 +124,7 @@ def unload_model(keep_model_name=False):
     model_class_name = shared.model.__class__.__name__
     is_llamacpp = (model_class_name == 'LlamaServer')
 
-    if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
+    if model_class_name in ['Exllamav3Model', 'Exllamav3HF', 'TensorRTLLMModel']:
         shared.model.unload()
 
     shared.model = shared.tokenizer = None
diff --git a/modules/shared.py b/modules/shared.py
index ec6e23b9..4e212d1b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -133,10 +133,6 @@ group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enab
 group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.')
 group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
 
-# TensorRT-LLM
-group = parser.add_argument_group('TensorRT-LLM')
-group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.')
-
 # RoPE
 group = parser.add_argument_group('RoPE')
 group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py
index 161dde82..ae061d06 100644
--- a/modules/tensorrt_llm.py
+++ b/modules/tensorrt_llm.py
@@ -1,15 +1,10 @@
 from pathlib import Path
 
-import tensorrt_llm
-import torch
-from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import SamplingParams
 
 from modules import shared
 from modules.logging_colors import logger
-from modules.text_generation import (
-    get_max_prompt_length,
-    get_reply_from_output_ids
-)
 
 
 class TensorRTLLMModel:
@@ -18,91 +13,50 @@ class TensorRTLLMModel:
 
     @classmethod
     def from_pretrained(cls, path_to_model):
-
         path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
-        runtime_rank = tensorrt_llm.mpi_rank()
 
-        # Define model settings
-        runner_kwargs = dict(
-            engine_dir=str(path_to_model),
-            lora_dir=None,
-            rank=runtime_rank,
-            debug_mode=False,
-            lora_ckpt_source="hf",
+        llm = LLM(  # TensorRT-LLM LLM API object; all generation goes through it
+            model=str(path_to_model),  # model folder resolved under shared.args.model_dir
+            skip_tokenizer_init=False,  # presumably lets the LLM load its own tokenizer (read below) -- TODO confirm
         )
 
-        if shared.args.cpp_runner:
-            logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
-            runner_kwargs.update(
-                max_batch_size=1,
-                max_beam_width=1,
-            )
-        else:
-            logger.info("TensorRT-LLM: Using \"ModelRunner\"")
-
-        # Load the model
-        runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner
-        runner = runner_cls.from_dir(**runner_kwargs)
-
         result = cls()
-        result.model = runner
-        result.runtime_rank = runtime_rank
-
+        result.llm = llm  # kept on the instance: generate_with_streaming() and unload() use self.llm
+        result.tokenizer = llm.tokenizer  # exposed so callers can take the tokenizer from the wrapper
         return result
 
     def generate_with_streaming(self, prompt, state):
-        batch_input_ids = []
-        input_ids = shared.tokenizer.encode(
-            prompt,
-            add_special_tokens=True,
-            truncation=False,
+        sampling_params = SamplingParams(  # per-request sampling settings taken from the generation state
+            max_tokens=state['max_new_tokens'] if not state['auto_max_new_tokens']
+                       else max(1, state['truncation_length'] - len(shared.tokenizer.encode(prompt))),  # clamp: the prompt is not truncated here, so the remaining budget could otherwise be <= 0
+            end_id=shared.tokenizer.eos_token_id,
+            temperature=state['temperature'],
+            top_k=state['top_k'],
+            top_p=state['top_p'],
+            min_p=state['min_p'],
+            repetition_penalty=state['repetition_penalty'],
+            presence_penalty=state['presence_penalty'],
+            frequency_penalty=state['frequency_penalty'],
+            no_repeat_ngram_size=state['no_repeat_ngram_size'] if state['no_repeat_ngram_size'] > 0 else None,  # non-positive values are passed as None
+            seed=state['seed'],
+            ignore_eos=state['ban_eos_token'],  # the "ban EOS token" option is forwarded as ignore_eos
+            add_special_tokens=state['add_bos_token'],
+            skip_special_tokens=state['skip_special_tokens'],
         )
-        input_ids = torch.tensor(input_ids, dtype=torch.int32)
-        input_ids = input_ids[-get_max_prompt_length(state):]  # Apply truncation_length
-        batch_input_ids.append(input_ids)
 
-        if state['auto_max_new_tokens']:
-            max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
-        else:
-            max_new_tokens = state['max_new_tokens']
-
-        with torch.no_grad():
-            generator = self.model.generate(
-                batch_input_ids,
-                max_new_tokens=max_new_tokens,
-                end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1,
-                pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id,
-                temperature=state['temperature'],
-                top_k=state['top_k'],
-                top_p=state['top_p'],
-                repetition_penalty=state['repetition_penalty'],
-                presence_penalty=state['presence_penalty'],
-                frequency_penalty=state['frequency_penalty'],
-                stop_words_list=None,
-                bad_words_list=None,
-                lora_uids=None,
-                prompt_table=None,
-                prompt_tasks=None,
-                streaming=True,
-                output_sequence_lengths=True,
-                return_dict=True,
-            )
-
-        torch.cuda.synchronize()
+        stop_event = state.get('stop_event')  # optional per-request cancellation flag; None when the key is absent
+        result = self.llm.generate_async(prompt, sampling_params=sampling_params, streaming=True)  # the raw prompt string is passed; partial outputs are iterated below
 
         cumulative_reply = ''
-        starting_from = batch_input_ids[0].shape[-1]
-
-        for curr_outputs in generator:
-            if shared.stop_everything:
+        for output in result:
+            if shared.stop_everything or (stop_event and stop_event.is_set()):  # global stop or this request's own stop flag
+                result.abort()  # ask the request to stop before leaving the loop
                 break
 
-            sequence_length = curr_outputs['sequence_lengths'][0].item()
-            output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
-
-            cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
-            starting_from = sequence_length
-            yield cumulative_reply
+            text_diff = output.outputs[0].text_diff  # presumably only the text added since the previous output -- TODO confirm
+            if text_diff:  # skip outputs that carry no new text
+                cumulative_reply += text_diff
+                yield cumulative_reply  # yields the whole reply so far, not just the new piece
 
     def generate(self, prompt, state):
         output = ''
@@ -110,3 +64,8 @@ class TensorRTLLMModel:
             pass
 
         return output
+
+    def unload(self):  # shut down the wrapped LLM object, if one is attached
+        if getattr(self, 'llm', None) is not None:
+            self.llm.shutdown()
+            self.llm = None  # drop the reference so a repeated unload() does nothing
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 6e0e67a1..02c1320c 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -25,7 +25,7 @@ def generate_reply(*args, **kwargs):
     state = args[1] if len(args) > 1 else kwargs.get('state', {})
     use_parallel = (
         state.get('stop_event') is not None
-        and shared.model.__class__.__name__ in ['Exllamav3Model', 'LlamaServer']
+        and shared.model.__class__.__name__ in ['Exllamav3Model', 'LlamaServer', 'TensorRTLLMModel']
         and (shared.model.__class__.__name__ != 'LlamaServer' or shared.args.parallel > 1)
     )
 
diff --git a/modules/ui.py b/modules/ui.py
index fd20e782..0cd2cf43 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -157,7 +157,6 @@ def list_model_elements():
         'enable_tp',
         'tp_backend',
         'cfg_cache',
-        'cpp_runner',
         'no_use_fast',
         'model_draft',
         'draft_max',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 12b5654c..0c2450d5 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -56,8 +56,11 @@ def create_ui():
                             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
                             shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
-                            shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
-                            shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
+                            shared.gradio['tensorrt_llm_info'] = gr.Markdown(
+                                '* TensorRT-LLM has to be installed manually: `pip install tensorrt_llm==1.1.0 --extra-index-url https://pypi.nvidia.com`.\n\n'
+                                '* You can load either a pre-built TensorRT engine or a regular HF model. '
+                                'HF models will be compiled to a TensorRT engine automatically on each load (this can take a while).'
+                            )
 
                             # Multimodal
                             with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']: