ExLlamaV3: Enable the --enable-tp option, add a --tp-backend option

2025-12-06 07:12:10 +01:00 · 2025-08-17 13:19:11 -07:00 · 2025-08-17 13:19:11 -07:00 · dbabe67e77
parent d771ca4a13
commit dbabe67e77
5 changed files with 16 additions and 2 deletions
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@ -91,6 +91,11 @@ class Exllamav3Model:
            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
            load_params['use_per_device'] = split

+        # Tensor-parallelism
+        if shared.args.enable_tp:
+            load_params['tensor_p'] = True
+            load_params['tp_backend'] = shared.args.tp_backend
+
        model.load(**load_params)
        tokenizer = Tokenizer.from_config(config)

--- a/modules/loaders.py
+++ b/modules/loaders.py
@ -65,6 +65,8 @@ loaders_and_params = OrderedDict({
        'draft_max',
        'ctx_size_draft',
        'speculative_decoding_accordion',
+        'enable_tp',
+        'tp_backend',
    ],
    'ExLlamav2_HF': [
        'ctx_size',
--- a/modules/shared.py
+++ b/modules/shared.py
@ -101,6 +101,11 @@ group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of
 group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
 group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.')

+# ExLlamaV3
+group = parser.add_argument_group('ExLlamaV3')
+group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) to split the model across GPUs.')
+group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.')
+
 # ExLlamaV2
 group = parser.add_argument_group('ExLlamaV2')
 group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
@ -110,7 +115,6 @@ group.add_argument('--no_flash_attn', action='store_true', help='Force flash-att
 group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
 group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
 group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
-group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')

 # TensorRT-LLM
 group = parser.add_argument_group('TensorRT-LLM')
--- a/modules/ui.py
+++ b/modules/ui.py
@ -155,6 +155,7 @@ def list_model_elements():
        'bf16',
        'autosplit',
        'enable_tp',
+        'tp_backend',
        'no_flash_attn',
        'no_xformers',
        'no_sdpa',
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@ -46,6 +46,8 @@ def create_ui():
                            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                            shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
                            shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
+                            shared.gradio['tp_backend'] = gr.Dropdown(label="tp-backend", choices=['native', 'nccl'], value=shared.args.tp_backend, info='The backend for tensor parallelism.')
+
                        with gr.Column():
                            shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
                            shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
@ -54,7 +56,7 @@ def create_ui():
                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
                            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
                            shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
-                            shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
+                            shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
                            shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
                            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
                            shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')