Update TensorRT-LLM to v1.1.0

oobabooga 2026-03-05 09:32:28 -03:00
parent 69fa4dd0b1
commit 268cc3f100
3 changed files with 13 additions and 32 deletions


@@ -18,7 +18,7 @@ RUN pip install -r requirements/full/requirements.txt
 ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
 # Install TensorRT-LLM
-RUN pip3 install tensorrt_llm==0.10.0 -U --pre --extra-index-url https://pypi.nvidia.com
+RUN pip3 install tensorrt_llm==1.1.0 -U --extra-index-url https://pypi.nvidia.com
 # Expose the necessary port for the Python server
 EXPOSE 7860 5000
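For reference, a minimal sketch of how one might confirm inside the built image which release the updated RUN line actually installed (assuming the package exposes __version__, which current tensorrt_llm wheels do):

```python
# Hypothetical sanity check, run inside the container built from this Dockerfile.
import tensorrt_llm

# After this change the installed wheel should report 1.1.0.
print(tensorrt_llm.__version__)
```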


@@ -142,7 +142,7 @@ group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N',
 # TensorRT-LLM
 group = parser.add_argument_group('TensorRT-LLM')
-group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
+group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.')
 # RoPE
 group = parser.add_argument_group('RoPE')
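For context, a small self-contained argparse sketch (hypothetical parser, mirroring the option above) showing how a store_true flag such as --cpp-runner surfaces after parsing:

```python
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group('TensorRT-LLM')
# A store_true flag defaults to False and flips to True when passed.
group.add_argument('--cpp-runner', action='store_true',
                   help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.')

args = parser.parse_args(['--cpp-runner'])
print(args.cpp_runner)  # True; argparse maps the dash to an underscore
```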


@@ -17,7 +17,7 @@ class TensorRTLLMModel:
         pass
     @classmethod
-    def from_pretrained(self, path_to_model):
+    def from_pretrained(cls, path_to_model):
         path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
         runtime_rank = tensorrt_llm.mpi_rank()
@@ -35,11 +35,7 @@ class TensorRTLLMModel:
             logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
             runner_kwargs.update(
                 max_batch_size=1,
-                max_input_len=shared.args.ctx_size - 512,
-                max_output_len=512,
-                max_beam_width=1,
-                max_attention_window_size=None,
-                sink_token_length=None,
             )
         else:
             logger.info("TensorRT-LLM: Using \"ModelRunner\"")
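The loader assembles its runner configuration as a plain dict and only layers extra limits on top when the C++ runner is selected. A minimal sketch of that assemble-then-update pattern (illustrative keys, not the project's full set):

```python
# Base configuration shared by both runners (illustrative values only).
runner_kwargs = dict(
    engine_dir='/path/to/engine',
    rank=0,
)

use_cpp_runner = True
if use_cpp_runner:
    # dict.update() merges the extra keys in place, overwriting any duplicates.
    runner_kwargs.update(
        max_batch_size=1,
    )

print(sorted(runner_kwargs))  # ['engine_dir', 'max_batch_size', 'rank']
```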
@@ -48,7 +44,7 @@ class TensorRTLLMModel:
         runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner
         runner = runner_cls.from_dir(**runner_kwargs)
-        result = self()
+        result = cls()
         result.model = runner
         result.runtime_rank = runtime_rank
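The self-to-cls rename matters because from_pretrained is a @classmethod: the first argument is the class itself, so calling cls() builds the right type even from a subclass. A standalone sketch of the pattern (toy classes, not the project's code):

```python
class Loader:
    @classmethod
    def from_pretrained(cls, path):
        # cls is whichever class the method was called on, so subclasses
        # get instances of themselves rather than of Loader.
        result = cls()
        result.path = path
        return result


class TrtLoader(Loader):
    pass


print(type(Loader.from_pretrained('a')).__name__)     # Loader
print(type(TrtLoader.from_pretrained('b')).__name__)  # TrtLoader
```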
@@ -65,9 +61,7 @@ class TensorRTLLMModel:
         input_ids = input_ids[-get_max_prompt_length(state):]  # Apply truncation_length
         batch_input_ids.append(input_ids)
-        if shared.args.cpp_runner:
-            max_new_tokens = min(512, state['max_new_tokens'])
-        elif state['auto_max_new_tokens']:
+        if state['auto_max_new_tokens']:
             max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
         else:
             max_new_tokens = state['max_new_tokens']
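With the C++-runner cap gone, auto_max_new_tokens simply hands generation whatever is left of the context window. A toy sketch of that arithmetic (illustrative numbers):

```python
truncation_length = 8192   # total context budget
prompt_length = 3100       # tokens already used by the prompt
auto_max_new_tokens = True

if auto_max_new_tokens:
    # Spend everything left in the window on new tokens.
    max_new_tokens = truncation_length - prompt_length
else:
    max_new_tokens = 512   # whatever the user requested

print(max_new_tokens)  # 5092
```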
@@ -76,27 +70,22 @@ class TensorRTLLMModel:
         generator = self.model.generate(
             batch_input_ids,
             max_new_tokens=max_new_tokens,
-            max_attention_window_size=None,
-            sink_token_length=None,
             end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1,
             pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id,
             temperature=state['temperature'],
             top_k=state['top_k'],
             top_p=state['top_p'],
             num_beams=1,
             length_penalty=1.0,
             repetition_penalty=state['repetition_penalty'],
             presence_penalty=state['presence_penalty'],
             frequency_penalty=state['frequency_penalty'],
             stop_words_list=None,
             bad_words_list=None,
             lora_uids=None,
             prompt_table_path=None,
             prompt_table=None,
             prompt_tasks=None,
-            streaming=not shared.args.cpp_runner,
+            streaming=True,
             output_sequence_lengths=True,
             return_dict=True,
             medusa_choices=None
         )
         torch.cuda.synchronize()
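Two of the surviving keyword arguments encode small fallbacks: end_id is set to -1 when the EOS token is banned, and pad_id falls back to the EOS id when the tokenizer defines no pad token. A standalone sketch of that selection logic (hypothetical tokenizer object):

```python
class Tok:
    eos_token_id = 2
    pad_token_id = None  # many chat tokenizers define no pad token

tokenizer = Tok()
ban_eos_token = False

# -1 stands in for "no EOS id" when the user bans the EOS token.
end_id = tokenizer.eos_token_id if not ban_eos_token else -1
# `or` picks the EOS id whenever pad_token_id is None (it would also trigger on 0).
pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id

print(end_id, pad_id)  # 2 2
```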
@@ -104,24 +93,16 @@ class TensorRTLLMModel:
         cumulative_reply = ''
         starting_from = batch_input_ids[0].shape[-1]
-        if shared.args.cpp_runner:
-            sequence_length = generator['sequence_lengths'][0].item()
-            output_ids = generator['output_ids'][0][0][:sequence_length].tolist()
+        for curr_outputs in generator:
+            if shared.stop_everything:
+                break
+            sequence_length = curr_outputs['sequence_lengths'][0].item()
+            output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
+            cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
+            starting_from = sequence_length
+            yield cumulative_reply
-        else:
-            for curr_outputs in generator:
-                if shared.stop_everything:
-                    break
-                sequence_length = curr_outputs['sequence_lengths'][0].item()
-                output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
-                cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
-                starting_from = sequence_length
-                yield cumulative_reply
     def generate(self, prompt, state):
         output = ''
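Because generate_with_streaming yields the cumulative reply after every step, a caller only needs to keep the most recent value. A small self-contained sketch of that consumption pattern (toy generator, not the real runner):

```python
def fake_stream():
    # Stand-in for generate_with_streaming: each yield is the full reply so far.
    reply = ''
    for piece in ['Hel', 'lo', ' world']:
        reply += piece
        yield reply

last = ''
for cumulative in fake_stream():
    last = cumulative   # overwrite, don't append: chunks are cumulative
print(last)             # 'Hello world'
```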