mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-04-07 23:53:40 +00:00
TensorRT-LLM: Migrate from ModelRunner to LLM API, add concurrent API request support
This commit is contained in:
parent
9824c82cb6
commit
f52d9336e5
7 changed files with 50 additions and 89 deletions
|
|
@@ -56,8 +56,11 @@ def create_ui():
|
|||
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
|
||||
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
|
||||
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
|
||||
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
|
||||
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
|
||||
shared.gradio['tensorrt_llm_info'] = gr.Markdown(
|
||||
'* TensorRT-LLM has to be installed manually: `pip install tensorrt_llm==1.1.0 --extra-index-url https://pypi.nvidia.com`.\n\n'
|
||||
'* You can load either a pre-built TensorRT engine or a regular HF model. '
|
||||
'HF models will be compiled to a TensorRT engine automatically on each load (this can take a while).'
|
||||
)
|
||||
|
||||
# Multimodal
|
||||
with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue