From f52d9336e53cd3149dc76e95c02925498915eabd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 5 Mar 2026 18:09:45 -0800 Subject: [PATCH] TensorRT-LLM: Migrate from ModelRunner to LLM API, add concurrent API request support --- modules/loaders.py | 6 +- modules/models.py | 4 +- modules/shared.py | 4 -- modules/tensorrt_llm.py | 115 ++++++++++++------------------------- modules/text_generation.py | 2 +- modules/ui.py | 1 - modules/ui_model_menu.py | 7 ++- 7 files changed, 50 insertions(+), 89 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index 0348c939..15b8dfeb 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -77,7 +77,6 @@ loaders_and_params = OrderedDict({ ], 'TensorRT-LLM': [ 'ctx_size', - 'cpp_runner', 'tensorrt_llm_info', ] }) @@ -252,11 +251,16 @@ loaders_samplers = { 'temperature', 'top_p', 'top_k', + 'min_p', 'repetition_penalty', 'frequency_penalty', 'presence_penalty', + 'no_repeat_ngram_size', 'auto_max_new_tokens', 'ban_eos_token', + 'add_bos_token', + 'skip_special_tokens', + 'seed', } } diff --git a/modules/models.py b/modules/models.py index 9780b9f5..bc5585cf 100644 --- a/modules/models.py +++ b/modules/models.py @@ -114,7 +114,7 @@ def TensorRT_LLM_loader(model_name): raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. Please install it manually following the instructions in the TensorRT-LLM GitHub repository.") model = TensorRTLLMModel.from_pretrained(model_name) - return model + return model, model.tokenizer def unload_model(keep_model_name=False): @@ -124,7 +124,7 @@ def unload_model(keep_model_name=False): model_class_name = shared.model.__class__.__name__ is_llamacpp = (model_class_name == 'LlamaServer') - if model_class_name in ['Exllamav3Model', 'Exllamav3HF']: + if model_class_name in ['Exllamav3Model', 'Exllamav3HF', 'TensorRTLLMModel']: shared.model.unload() shared.model = shared.tokenizer = None diff --git a/modules/shared.py b/modules/shared.py index ec6e23b9..4e212d1b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -133,10 +133,6 @@ group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enab group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.') group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') -# TensorRT-LLM -group = parser.add_argument_group('TensorRT-LLM') -group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.') - # RoPE group = parser.add_argument_group('RoPE') group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.') diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py index 161dde82..ae061d06 100644 --- a/modules/tensorrt_llm.py +++ b/modules/tensorrt_llm.py @@ -1,15 +1,10 @@ from pathlib import Path -import tensorrt_llm -import torch -from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp +from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm.llmapi import SamplingParams from modules import shared from modules.logging_colors import logger -from modules.text_generation import ( - get_max_prompt_length, - get_reply_from_output_ids -) class TensorRTLLMModel: @@ -18,91 +13,50 @@ class TensorRTLLMModel: @classmethod def from_pretrained(cls, path_to_model): - path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model) - runtime_rank = tensorrt_llm.mpi_rank() - # Define model settings - runner_kwargs = dict( - engine_dir=str(path_to_model), - lora_dir=None, - rank=runtime_rank, - debug_mode=False, - lora_ckpt_source="hf", + llm = LLM( + model=str(path_to_model), + skip_tokenizer_init=False, ) - if shared.args.cpp_runner: - logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"") - runner_kwargs.update( - max_batch_size=1, - max_beam_width=1, - ) - else: - logger.info("TensorRT-LLM: Using \"ModelRunner\"") - - # Load the model - runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner - runner = runner_cls.from_dir(**runner_kwargs) - result = cls() - result.model = runner - result.runtime_rank = runtime_rank - + result.llm = llm + result.tokenizer = llm.tokenizer return result def generate_with_streaming(self, prompt, state): - batch_input_ids = [] - input_ids = shared.tokenizer.encode( - prompt, - add_special_tokens=True, - truncation=False, + sampling_params = SamplingParams( + max_tokens=state['max_new_tokens'] if not state['auto_max_new_tokens'] + else state['truncation_length'] - len(shared.tokenizer.encode(prompt)), + end_id=shared.tokenizer.eos_token_id, + temperature=state['temperature'], + top_k=state['top_k'], + top_p=state['top_p'], + min_p=state['min_p'], + repetition_penalty=state['repetition_penalty'], + presence_penalty=state['presence_penalty'], + frequency_penalty=state['frequency_penalty'], + no_repeat_ngram_size=state['no_repeat_ngram_size'] if state['no_repeat_ngram_size'] > 0 else None, + seed=state['seed'], + ignore_eos=state['ban_eos_token'], + add_special_tokens=state['add_bos_token'], + skip_special_tokens=state['skip_special_tokens'], ) - input_ids = torch.tensor(input_ids, dtype=torch.int32) - input_ids = input_ids[-get_max_prompt_length(state):] # Apply truncation_length - batch_input_ids.append(input_ids) - if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - input_ids.shape[-1] - else: - max_new_tokens = state['max_new_tokens'] - - with torch.no_grad(): - generator = self.model.generate( - batch_input_ids, - max_new_tokens=max_new_tokens, - end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1, - pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id, - temperature=state['temperature'], - top_k=state['top_k'], - top_p=state['top_p'], - repetition_penalty=state['repetition_penalty'], - presence_penalty=state['presence_penalty'], - frequency_penalty=state['frequency_penalty'], - stop_words_list=None, - bad_words_list=None, - lora_uids=None, - prompt_table=None, - prompt_tasks=None, - streaming=True, - output_sequence_lengths=True, - return_dict=True, - ) - - torch.cuda.synchronize() + stop_event = state.get('stop_event') + result = self.llm.generate_async(prompt, sampling_params=sampling_params, streaming=True) cumulative_reply = '' - starting_from = batch_input_ids[0].shape[-1] - - for curr_outputs in generator: - if shared.stop_everything: + for output in result: + if shared.stop_everything or (stop_event and stop_event.is_set()): + result.abort() break - sequence_length = curr_outputs['sequence_lengths'][0].item() - output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist() - - cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from) - starting_from = sequence_length - yield cumulative_reply + text_diff = output.outputs[0].text_diff + if text_diff: + cumulative_reply += text_diff + yield cumulative_reply def generate(self, prompt, state): output = '' @@ -110,3 +64,8 @@ class TensorRTLLMModel: pass return output + + def unload(self): + if hasattr(self, 'llm') and self.llm is not None: + self.llm.shutdown() + self.llm = None diff --git a/modules/text_generation.py b/modules/text_generation.py index 6e0e67a1..02c1320c 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -25,7 +25,7 @@ def generate_reply(*args, **kwargs): state = args[1] if len(args) > 1 else kwargs.get('state', {}) use_parallel = ( state.get('stop_event') is not None - and shared.model.__class__.__name__ in ['Exllamav3Model', 'LlamaServer'] + and shared.model.__class__.__name__ in ['Exllamav3Model', 'LlamaServer', 'TensorRTLLMModel'] and (shared.model.__class__.__name__ != 'LlamaServer' or shared.args.parallel > 1) ) diff --git a/modules/ui.py b/modules/ui.py index fd20e782..0cd2cf43 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -157,7 +157,6 @@ def list_model_elements(): 'enable_tp', 'tp_backend', 'cfg_cache', - 'cpp_runner', 'no_use_fast', 'model_draft', 'draft_max', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 12b5654c..0c2450d5 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -56,8 +56,11 @@ def create_ui(): shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).') - shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') - shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') + shared.gradio['tensorrt_llm_info'] = gr.Markdown( + '* TensorRT-LLM has to be installed manually: `pip install tensorrt_llm==1.1.0 --extra-index-url https://pypi.nvidia.com`.\n\n' + '* You can load either a pre-built TensorRT engine or a regular HF model. ' + 'HF models will be compiled to a TensorRT engine automatically on each load (this can take a while).' + ) # Multimodal with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']: