diff --git a/modules/loaders.py b/modules/loaders.py
index 980a13e6..2fbba588 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -110,7 +110,6 @@ loaders_and_params = OrderedDict({
         'no_flash_attn',
         'no_xformers',
         'no_sdpa',
-        'exllamav2_info',
     ],
     'HQQ': [
         'hqq_backend',
diff --git a/modules/models_settings.py b/modules/models_settings.py
index b83544d4..b50840d7 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -172,11 +172,11 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
     elif hf_quant_method == 'exl3':
         loader = 'ExLlamav3_HF'
     elif hf_quant_method in ['exl2', 'gptq']:
-        loader = 'ExLlamav2_HF'
+        loader = 'ExLlamav2'
     elif re.match(r'.*exl3', model_name.lower()):
         loader = 'ExLlamav3_HF'
     elif re.match(r'.*exl2', model_name.lower()):
-        loader = 'ExLlamav2_HF'
+        loader = 'ExLlamav2'
     elif re.match(r'.*-hqq', model_name.lower()):
         return 'HQQ'
     else:
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 4fc1de08..141869fa 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -133,7 +133,6 @@ def create_ui():
             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
             shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
             shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
-            shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
             shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
 
         with gr.Column():
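
The behavioral effect of the `models_settings.py` hunk: exl2- and GPTQ-quantized models now infer the plain `ExLlamav2` loader instead of `ExLlamav2_HF`, which is why the UI note recommending `ExLlamav2_HF` is dropped in the other two hunks. Below is a minimal sketch of that dispatch logic; `infer_loader_sketch` is a hypothetical, self-contained rendering of only the branches visible in the hunk, not the project's actual function.

```python
import re


def infer_loader_sketch(model_name, hf_quant_method=None):
    """Hypothetical stand-in for the branches of infer_loader() shown above."""
    if hf_quant_method == 'exl3':
        return 'ExLlamav3_HF'
    elif hf_quant_method in ['exl2', 'gptq']:
        return 'ExLlamav2'  # was 'ExLlamav2_HF' before this patch
    elif re.match(r'.*exl3', model_name.lower()):
        return 'ExLlamav3_HF'
    elif re.match(r'.*exl2', model_name.lower()):
        return 'ExLlamav2'  # was 'ExLlamav2_HF' before this patch
    elif re.match(r'.*-hqq', model_name.lower()):
        return 'HQQ'
    return None  # remaining branches omitted from the hunk


# A model directory whose name contains "exl2" now maps to the plain loader:
assert infer_loader_sketch('turboderp_Llama-3-8B-exl2') == 'ExLlamav2'
```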