mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-04-04 22:27:29 +00:00
Remove the AutoGPTQ loader (#6641)
This commit is contained in:
parent
d3adcbf64b
commit
7157257c3f
10 changed files with 19 additions and 228 deletions
|
|
@ -86,7 +86,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
|
|||
|
||||
# Model loader
|
||||
group = parser.add_argument_group('Model loader')
|
||||
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
|
||||
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2.')
|
||||
|
||||
# Transformers/Accelerate
|
||||
group = parser.add_argument_group('Transformers/Accelerate')
|
||||
|
|
@ -147,17 +147,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n
|
|||
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
|
||||
group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
|
||||
|
||||
# AutoGPTQ
|
||||
group = parser.add_argument_group('AutoGPTQ')
|
||||
group.add_argument('--triton', action='store_true', help='Use triton.')
|
||||
group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.')
|
||||
group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
|
||||
group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
|
||||
group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
|
||||
group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')
|
||||
group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
|
||||
group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
|
||||
|
||||
# HQQ
|
||||
group = parser.add_argument_group('HQQ')
|
||||
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
|
||||
|
|
@ -220,6 +209,14 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='DEP
|
|||
group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--triton', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--desc_act', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--wbits', type=int, default=0, help='DEPRECATED')
|
||||
group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED')
|
||||
|
||||
args = parser.parse_args()
|
||||
args_defaults = parser.parse_args([])
|
||||
|
|
@ -262,8 +259,6 @@ def fix_loader_name(name):
|
|||
return 'llamacpp_HF'
|
||||
elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
|
||||
return 'Transformers'
|
||||
elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']:
|
||||
return 'AutoGPTQ'
|
||||
elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
|
||||
return 'ExLlama'
|
||||
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue