From 268cc3f10031f6b6faecbc23daf327752d3c3591 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Mar 2026 09:32:28 -0300
Subject: [PATCH] Update TensorRT-LLM to v1.1.0

---
 docker/TensorRT-LLM/Dockerfile |  2 +-
 modules/shared.py              |  2 +-
 modules/tensorrt_llm.py        | 41 +++++++++-------------------------
 3 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/docker/TensorRT-LLM/Dockerfile b/docker/TensorRT-LLM/Dockerfile
index c2be3efe..3c989d79 100644
--- a/docker/TensorRT-LLM/Dockerfile
+++ b/docker/TensorRT-LLM/Dockerfile
@@ -18,7 +18,7 @@ RUN pip install -r requirements/full/requirements.txt
 ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
 
 # Install TensorRT-LLM
-RUN pip3 install tensorrt_llm==0.10.0 -U --pre --extra-index-url https://pypi.nvidia.com
+RUN pip3 install tensorrt_llm==1.1.0 -U --extra-index-url https://pypi.nvidia.com
 
 # Expose the necessary port for the Python server
 EXPOSE 7860 5000
diff --git a/modules/shared.py b/modules/shared.py
index c80db298..8d617ecd 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -142,7 +142,7 @@ group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N',
 
 # TensorRT-LLM
 group = parser.add_argument_group('TensorRT-LLM')
-group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
+group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.')
 
 # RoPE
 group = parser.add_argument_group('RoPE')
diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py
index 0527d493..161dde82 100644
--- a/modules/tensorrt_llm.py
+++ b/modules/tensorrt_llm.py
@@ -17,7 +17,7 @@ class TensorRTLLMModel:
         pass
 
     @classmethod
-    def from_pretrained(self, path_to_model):
+    def from_pretrained(cls, path_to_model):
         path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
         runtime_rank = tensorrt_llm.mpi_rank()
 
@@ -35,11 +35,7 @@ class TensorRTLLMModel:
             logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
             runner_kwargs.update(
                 max_batch_size=1,
-                max_input_len=shared.args.ctx_size - 512,
-                max_output_len=512,
                 max_beam_width=1,
-                max_attention_window_size=None,
-                sink_token_length=None,
             )
         else:
             logger.info("TensorRT-LLM: Using \"ModelRunner\"")
@@ -48,7 +44,7 @@ class TensorRTLLMModel:
         runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner
         runner = runner_cls.from_dir(**runner_kwargs)
 
-        result = self()
+        result = cls()
         result.model = runner
         result.runtime_rank = runtime_rank
 
@@ -65,9 +61,7 @@ class TensorRTLLMModel:
             input_ids = input_ids[-get_max_prompt_length(state):]  # Apply truncation_length
         batch_input_ids.append(input_ids)
 
-        if shared.args.cpp_runner:
-            max_new_tokens = min(512, state['max_new_tokens'])
-        elif state['auto_max_new_tokens']:
+        if state['auto_max_new_tokens']:
             max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
         else:
             max_new_tokens = state['max_new_tokens']
@@ -76,27 +70,22 @@ class TensorRTLLMModel:
         generator = self.model.generate(
             batch_input_ids,
             max_new_tokens=max_new_tokens,
-            max_attention_window_size=None,
-            sink_token_length=None,
             end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1,
             pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id,
             temperature=state['temperature'],
             top_k=state['top_k'],
             top_p=state['top_p'],
-            num_beams=1,
-            length_penalty=1.0,
             repetition_penalty=state['repetition_penalty'],
             presence_penalty=state['presence_penalty'],
             frequency_penalty=state['frequency_penalty'],
             stop_words_list=None,
             bad_words_list=None,
             lora_uids=None,
-            prompt_table_path=None,
+            prompt_table=None,
             prompt_tasks=None,
-            streaming=not shared.args.cpp_runner,
+            streaming=True,
             output_sequence_lengths=True,
             return_dict=True,
-            medusa_choices=None
         )
 
         torch.cuda.synchronize()
@@ -104,24 +93,16 @@ class TensorRTLLMModel:
         cumulative_reply = ''
         starting_from = batch_input_ids[0].shape[-1]
 
-        if shared.args.cpp_runner:
-            sequence_length = generator['sequence_lengths'][0].item()
-            output_ids = generator['output_ids'][0][0][:sequence_length].tolist()
+        for curr_outputs in generator:
+            if shared.stop_everything:
+                break
+
+            sequence_length = curr_outputs['sequence_lengths'][0].item()
+            output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
 
             cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
             starting_from = sequence_length
             yield cumulative_reply
-        else:
-            for curr_outputs in generator:
-                if shared.stop_everything:
-                    break
-
-                sequence_length = curr_outputs['sequence_lengths'][0].item()
-                output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
-
-                cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
-                starting_from = sequence_length
-                yield cumulative_reply
 
     def generate(self, prompt, state):
         output = ''