mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-08 06:33:51 +01:00
TensorRT-LLM: Migrate from ModelRunner to LLM API, add concurrent API request support
This commit is contained in:
parent
9824c82cb6
commit
f52d9336e5
|
|
@ -77,7 +77,6 @@ loaders_and_params = OrderedDict({
|
|||
],
|
||||
'TensorRT-LLM': [
|
||||
'ctx_size',
|
||||
'cpp_runner',
|
||||
'tensorrt_llm_info',
|
||||
]
|
||||
})
|
||||
|
|
@ -252,11 +251,16 @@ loaders_samplers = {
|
|||
'temperature',
|
||||
'top_p',
|
||||
'top_k',
|
||||
'min_p',
|
||||
'repetition_penalty',
|
||||
'frequency_penalty',
|
||||
'presence_penalty',
|
||||
'no_repeat_ngram_size',
|
||||
'auto_max_new_tokens',
|
||||
'ban_eos_token',
|
||||
'add_bos_token',
|
||||
'skip_special_tokens',
|
||||
'seed',
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ def TensorRT_LLM_loader(model_name):
|
|||
raise ModuleNotFoundError("Failed to import 'tensorrt_llm'. Please install it manually following the instructions in the TensorRT-LLM GitHub repository.")
|
||||
|
||||
model = TensorRTLLMModel.from_pretrained(model_name)
|
||||
return model
|
||||
return model, model.tokenizer
|
||||
|
||||
|
||||
def unload_model(keep_model_name=False):
|
||||
|
|
@ -124,7 +124,7 @@ def unload_model(keep_model_name=False):
|
|||
model_class_name = shared.model.__class__.__name__
|
||||
is_llamacpp = (model_class_name == 'LlamaServer')
|
||||
|
||||
if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
|
||||
if model_class_name in ['Exllamav3Model', 'Exllamav3HF', 'TensorRTLLMModel']:
|
||||
shared.model.unload()
|
||||
|
||||
shared.model = shared.tokenizer = None
|
||||
|
|
|
|||
|
|
@ -133,10 +133,6 @@ group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enab
|
|||
group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.')
|
||||
group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
|
||||
|
||||
# TensorRT-LLM
|
||||
group = parser.add_argument_group('TensorRT-LLM')
|
||||
group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner.')
|
||||
|
||||
# RoPE
|
||||
group = parser.add_argument_group('RoPE')
|
||||
group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
|
||||
|
|
|
|||
|
|
@ -1,15 +1,10 @@
|
|||
from pathlib import Path
|
||||
|
||||
import tensorrt_llm
|
||||
import torch
|
||||
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
|
||||
from tensorrt_llm._tensorrt_engine import LLM
|
||||
from tensorrt_llm.llmapi import SamplingParams
|
||||
|
||||
from modules import shared
|
||||
from modules.logging_colors import logger
|
||||
from modules.text_generation import (
|
||||
get_max_prompt_length,
|
||||
get_reply_from_output_ids
|
||||
)
|
||||
|
||||
|
||||
class TensorRTLLMModel:
|
||||
|
|
@ -18,91 +13,50 @@ class TensorRTLLMModel:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, path_to_model):
|
||||
|
||||
path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
|
||||
runtime_rank = tensorrt_llm.mpi_rank()
|
||||
|
||||
# Define model settings
|
||||
runner_kwargs = dict(
|
||||
engine_dir=str(path_to_model),
|
||||
lora_dir=None,
|
||||
rank=runtime_rank,
|
||||
debug_mode=False,
|
||||
lora_ckpt_source="hf",
|
||||
llm = LLM(
|
||||
model=str(path_to_model),
|
||||
skip_tokenizer_init=False,
|
||||
)
|
||||
|
||||
if shared.args.cpp_runner:
|
||||
logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
|
||||
runner_kwargs.update(
|
||||
max_batch_size=1,
|
||||
max_beam_width=1,
|
||||
)
|
||||
else:
|
||||
logger.info("TensorRT-LLM: Using \"ModelRunner\"")
|
||||
|
||||
# Load the model
|
||||
runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner
|
||||
runner = runner_cls.from_dir(**runner_kwargs)
|
||||
|
||||
result = cls()
|
||||
result.model = runner
|
||||
result.runtime_rank = runtime_rank
|
||||
|
||||
result.llm = llm
|
||||
result.tokenizer = llm.tokenizer
|
||||
return result
|
||||
|
||||
def generate_with_streaming(self, prompt, state):
|
||||
batch_input_ids = []
|
||||
input_ids = shared.tokenizer.encode(
|
||||
prompt,
|
||||
add_special_tokens=True,
|
||||
truncation=False,
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=state['max_new_tokens'] if not state['auto_max_new_tokens']
|
||||
else state['truncation_length'] - len(shared.tokenizer.encode(prompt)),
|
||||
end_id=shared.tokenizer.eos_token_id,
|
||||
temperature=state['temperature'],
|
||||
top_k=state['top_k'],
|
||||
top_p=state['top_p'],
|
||||
min_p=state['min_p'],
|
||||
repetition_penalty=state['repetition_penalty'],
|
||||
presence_penalty=state['presence_penalty'],
|
||||
frequency_penalty=state['frequency_penalty'],
|
||||
no_repeat_ngram_size=state['no_repeat_ngram_size'] if state['no_repeat_ngram_size'] > 0 else None,
|
||||
seed=state['seed'],
|
||||
ignore_eos=state['ban_eos_token'],
|
||||
add_special_tokens=state['add_bos_token'],
|
||||
skip_special_tokens=state['skip_special_tokens'],
|
||||
)
|
||||
input_ids = torch.tensor(input_ids, dtype=torch.int32)
|
||||
input_ids = input_ids[-get_max_prompt_length(state):] # Apply truncation_length
|
||||
batch_input_ids.append(input_ids)
|
||||
|
||||
if state['auto_max_new_tokens']:
|
||||
max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
|
||||
else:
|
||||
max_new_tokens = state['max_new_tokens']
|
||||
|
||||
with torch.no_grad():
|
||||
generator = self.model.generate(
|
||||
batch_input_ids,
|
||||
max_new_tokens=max_new_tokens,
|
||||
end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1,
|
||||
pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id,
|
||||
temperature=state['temperature'],
|
||||
top_k=state['top_k'],
|
||||
top_p=state['top_p'],
|
||||
repetition_penalty=state['repetition_penalty'],
|
||||
presence_penalty=state['presence_penalty'],
|
||||
frequency_penalty=state['frequency_penalty'],
|
||||
stop_words_list=None,
|
||||
bad_words_list=None,
|
||||
lora_uids=None,
|
||||
prompt_table=None,
|
||||
prompt_tasks=None,
|
||||
streaming=True,
|
||||
output_sequence_lengths=True,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
stop_event = state.get('stop_event')
|
||||
result = self.llm.generate_async(prompt, sampling_params=sampling_params, streaming=True)
|
||||
|
||||
cumulative_reply = ''
|
||||
starting_from = batch_input_ids[0].shape[-1]
|
||||
|
||||
for curr_outputs in generator:
|
||||
if shared.stop_everything:
|
||||
for output in result:
|
||||
if shared.stop_everything or (stop_event and stop_event.is_set()):
|
||||
result.abort()
|
||||
break
|
||||
|
||||
sequence_length = curr_outputs['sequence_lengths'][0].item()
|
||||
output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist()
|
||||
|
||||
cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from)
|
||||
starting_from = sequence_length
|
||||
yield cumulative_reply
|
||||
text_diff = output.outputs[0].text_diff
|
||||
if text_diff:
|
||||
cumulative_reply += text_diff
|
||||
yield cumulative_reply
|
||||
|
||||
def generate(self, prompt, state):
|
||||
output = ''
|
||||
|
|
@ -110,3 +64,8 @@ class TensorRTLLMModel:
|
|||
pass
|
||||
|
||||
return output
|
||||
|
||||
def unload(self):
|
||||
if hasattr(self, 'llm') and self.llm is not None:
|
||||
self.llm.shutdown()
|
||||
self.llm = None
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ def generate_reply(*args, **kwargs):
|
|||
state = args[1] if len(args) > 1 else kwargs.get('state', {})
|
||||
use_parallel = (
|
||||
state.get('stop_event') is not None
|
||||
and shared.model.__class__.__name__ in ['Exllamav3Model', 'LlamaServer']
|
||||
and shared.model.__class__.__name__ in ['Exllamav3Model', 'LlamaServer', 'TensorRTLLMModel']
|
||||
and (shared.model.__class__.__name__ != 'LlamaServer' or shared.args.parallel > 1)
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -157,7 +157,6 @@ def list_model_elements():
|
|||
'enable_tp',
|
||||
'tp_backend',
|
||||
'cfg_cache',
|
||||
'cpp_runner',
|
||||
'no_use_fast',
|
||||
'model_draft',
|
||||
'draft_max',
|
||||
|
|
|
|||
|
|
@ -56,8 +56,11 @@ def create_ui():
|
|||
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
|
||||
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
|
||||
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
|
||||
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
|
||||
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
|
||||
shared.gradio['tensorrt_llm_info'] = gr.Markdown(
|
||||
'* TensorRT-LLM has to be installed manually: `pip install tensorrt_llm==1.1.0 --extra-index-url https://pypi.nvidia.com`.\n\n'
|
||||
'* You can load either a pre-built TensorRT engine or a regular HF model. '
|
||||
'HF models will be compiled to a TensorRT engine automatically on each load (this can take a while).'
|
||||
)
|
||||
|
||||
# Multimodal
|
||||
with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']:
|
||||
|
|
|
|||
Loading…
Reference in a new issue