diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index 1254ff5d..d9f4ed57 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -74,6 +74,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
             split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
             load_params['use_per_device'] = split
 
+        # Tensor-parallelism
+        if shared.args.enable_tp:
+            load_params['tensor_p'] = True
+            load_params['tp_backend'] = shared.args.tp_backend
+
         self.ex_model.load(**load_params)
         self.past_seq = None
         self.max_tokens = max_tokens
diff --git a/modules/loaders.py b/modules/loaders.py
index 295db1e7..f88e976d 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -56,6 +56,8 @@ loaders_and_params = OrderedDict({
         'cfg_cache',
         'trust_remote_code',
         'no_use_fast',
+        'enable_tp',
+        'tp_backend',
     ],
     'ExLlamav3': [
         'ctx_size',