Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2026-03-06 21:53:50 +01:00
Update TensorRT-LLM to v1.1.0
This commit is contained in:
parent 69fa4dd0b1
commit 268cc3f100
@@ -18,7 +18,7 @@ RUN pip install -r requirements/full/requirements.txt
 ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
 
 # Install TensorRT-LLM
-RUN pip3 install tensorrt_llm==0.10.0 -U --pre --extra-index-url https://pypi.nvidia.com
+RUN pip3 install tensorrt_llm==1.1.0 -U --extra-index-url https://pypi.nvidia.com
 
 # Expose the necessary port for the Python server
 EXPOSE 7860 5000
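The Dockerfile hunk pins the stable 1.1.0 wheel and drops the `--pre` flag, which was presumably only needed while the 0.10.0 wheels were pre-releases on NVIDIA's index. A quick sanity check to run inside the built image (a minimal sketch; it assumes only that the wheel exposes `tensorrt_llm.__version__`):

```python
# Verify that the wheel pinned in the Dockerfile is the one Python imports.
import tensorrt_llm

assert tensorrt_llm.__version__.startswith("1.1"), tensorrt_llm.__version__
print(tensorrt_llm.__version__)  # expected: 1.1.0
```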
@@ -142,7 +142,7 @@ group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N',
 
 # TensorRT-LLM
 group = parser.add_argument_group('TensorRT-LLM')
-group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
+group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.')
 
 # RoPE
 group = parser.add_argument_group('RoPE')
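The only change here is the `--cpp-runner` help string, which no longer warns that the C++ runner lacks streaming; the runtime hunks below make that true. A standalone sketch of the flag as registered above (parser and group names mirror the diff, the invocation is illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group('TensorRT-LLM')
group.add_argument('--cpp-runner', action='store_true',
                   help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.')

# argparse converts --cpp-runner to the attribute name cpp_runner
args = parser.parse_args(['--cpp-runner'])
print(args.cpp_runner)  # True
```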
@@ -17,7 +17,7 @@ class TensorRTLLMModel:
         pass
 
     @classmethod
-    def from_pretrained(self, path_to_model):
+    def from_pretrained(cls, path_to_model):
 
         path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
         runtime_rank = tensorrt_llm.mpi_rank()
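Renaming the first parameter from `self` to `cls` is the idiomatic fix here: under `@classmethod` the first argument is always the class object regardless of its name, and a later hunk updates the matching `self()` call to `cls()`. A minimal illustration with a hypothetical `Loader` class:

```python
class Loader:
    @classmethod
    def from_pretrained(cls, path_to_model):
        # cls is the Loader class itself, so cls() builds the instance
        # this alternate constructor returns
        result = cls()
        result.path = path_to_model
        return result

print(type(Loader.from_pretrained('engine-dir')))  # <class '__main__.Loader'>
```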
@@ -35,11 +35,7 @@ class TensorRTLLMModel:
             logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
             runner_kwargs.update(
                 max_batch_size=1,
-                max_input_len=shared.args.ctx_size - 512,
-                max_output_len=512,
                 max_beam_width=1,
-                max_attention_window_size=None,
-                sink_token_length=None,
             )
         else:
             logger.info("TensorRT-LLM: Using \"ModelRunner\"")
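The C++ runner no longer pre-carves the context into a fixed input/output split, so the limits baked into the engine at build time apply instead. A sketch of the arithmetic the removed kwargs encoded (the `ctx_size` value is made up); the matching `min(512, ...)` clamp on `max_new_tokens` disappears two hunks below:

```python
# Old budget: prompts capped at ctx_size - 512, replies hard-capped at 512.
ctx_size = 4096
max_input_len = ctx_size - 512
max_output_len = 512
print(max_input_len, max_output_len)  # 3584 512
```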
@@ -48,7 +44,7 @@ class TensorRTLLMModel:
         runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner
         runner = runner_cls.from_dir(**runner_kwargs)
 
-        result = self()
+        result = cls()
         result.model = runner
         result.runtime_rank = runtime_rank
 
@@ -65,9 +61,7 @@ class TensorRTLLMModel:
             input_ids = input_ids[-get_max_prompt_length(state):]  # Apply truncation_length
             batch_input_ids.append(input_ids)
 
-        if shared.args.cpp_runner:
-            max_new_tokens = min(512, state['max_new_tokens'])
-        elif state['auto_max_new_tokens']:
+        if state['auto_max_new_tokens']:
            max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
         else:
             max_new_tokens = state['max_new_tokens']
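With the 512-token clamp gone, both runners honor the requested `max_new_tokens`, and `auto_max_new_tokens` simply hands the reply whatever context the prompt left unused. Worked with made-up numbers:

```python
truncation_length = 4096  # total context budget
prompt_tokens = 3000      # input_ids.shape[-1] after truncation
max_new_tokens = truncation_length - prompt_tokens
print(max_new_tokens)     # 1096 tokens available for the reply
```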
@@ -76,27 +70,22 @@ class TensorRTLLMModel:
         generator = self.model.generate(
             batch_input_ids,
             max_new_tokens=max_new_tokens,
-            max_attention_window_size=None,
-            sink_token_length=None,
             end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1,
             pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id,
             temperature=state['temperature'],
             top_k=state['top_k'],
             top_p=state['top_p'],
             num_beams=1,
             length_penalty=1.0,
             repetition_penalty=state['repetition_penalty'],
             presence_penalty=state['presence_penalty'],
             frequency_penalty=state['frequency_penalty'],
             stop_words_list=None,
             bad_words_list=None,
             lora_uids=None,
-            prompt_table_path=None,
             prompt_table=None,
-            prompt_tasks=None,
-            streaming=not shared.args.cpp_runner,
+            streaming=True,
             output_sequence_lengths=True,
             return_dict=True,
-            medusa_choices=None
         )
         torch.cuda.synchronize()
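`generate()` is now always called with `streaming=True`, and several kwargs are dropped, presumably because the 1.1.0 runners no longer accept them. The `end_id`/`pad_id` fallbacks are unchanged; a sketch with illustrative token ids (the real ones come from `shared.tokenizer`):

```python
eos_token_id, pad_token_id = 2, None  # many models define no pad token
ban_eos_token = False

end_id = eos_token_id if not ban_eos_token else -1  # -1 disables the stop token
pad_id = pad_token_id or eos_token_id               # fall back to EOS when unset
print(end_id, pad_id)  # 2 2
```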
@@ -104,24 +93,16 @@ class TensorRTLLMModel:
         cumulative_reply = ''
         starting_from = batch_input_ids[0].shape[-1]
 
-        if shared.args.cpp_runner:
-            sequence_length = generator['sequence_lengths'][0].item()
-            output_ids = generator['output_ids'][0][0][:sequence_length].tolist()
+        for curr_outputs in generator:
+            if shared.stop_everything:
+                break
+
+            sequence_length = curr_outputs['sequence_lengths'][0].item()
+            output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
 
             cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
             starting_from = sequence_length
             yield cumulative_reply
-        else:
-            for curr_outputs in generator:
-                if shared.stop_everything:
-                    break
-
-                sequence_length = curr_outputs['sequence_lengths'][0].item()
-                output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
-
-                cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
-                starting_from = sequence_length
-                yield cumulative_reply
 
     def generate(self, prompt, state):
         output = ''
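The `cpp_runner` special case collapses into a single streaming loop, since `ModelRunnerCpp` now yields incremental outputs like the Python runner. A self-contained sketch of the consumption pattern, with a stub generator standing in for `model.generate(..., streaming=True)`:

```python
import torch

def fake_generate(ids, prompt_len):
    # Each step carries the whole sequence so far plus its current length,
    # mirroring the dicts the loop above slices with starting_from.
    for n in range(prompt_len + 1, len(ids) + 1):
        padded = ids[:n] + [0] * (len(ids) - n)
        yield {
            'sequence_lengths': torch.tensor([n]),
            'output_ids': torch.tensor([[padded]]),
        }

starting_from = 1  # prompt length; only decode tokens generated after it
for curr_outputs in fake_generate([10, 11, 12, 13], starting_from):
    sequence_length = curr_outputs['sequence_lengths'][0].item()
    output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
    print(output_ids[starting_from:])  # [11], then [12], then [13]
    starting_from = sequence_length
```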