diff --git a/README.md b/README.md index ee5a04bf..7105ce23 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). - - [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is also supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile). - - Additional quantization libraries like [HQQ](https://github.com/mobiusml/hqq) and [AQLM](https://github.com/Vahe1994/AQLM) can be used with the Transformers loader if you install them manually. +- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. - UI that resembles the original ChatGPT style. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. diff --git a/modules/loaders.py b/modules/loaders.py index 79a7a4a3..6fbd2198 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -90,11 +90,6 @@ loaders_and_params = OrderedDict({ 'ctx_size_draft', 'speculative_decoding_accordion', ], - 'HQQ': [ - 'hqq_backend', - 'trust_remote_code', - 'no_use_fast', - ], 'TensorRT-LLM': [ 'ctx_size', 'cpp_runner', @@ -158,7 +153,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), - 'HQQ': transformers_samplers(), 'ExLlamav3_HF': { 'temperature', 'dynatemp_low', diff --git a/modules/models.py b/modules/models.py index 9ecee803..4218d58c 100644 --- a/modules/models.py +++ b/modules/models.py @@ -21,7 +21,6 @@ def load_model(model_name, loader=None): 'ExLlamav3_HF': ExLlamav3_HF_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ExLlamav2': ExLlamav2_loader, - 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -102,21 +101,6 @@ def ExLlamav2_loader(model_name): return model, tokenizer -def HQQ_loader(model_name): - try: - from hqq.core.quantize import HQQBackend, HQQLinear - from hqq.models.hf.base import AutoHQQHFModel - except ModuleNotFoundError: - raise ModuleNotFoundError("Failed to import 'hqq'. Please install it manually following the instructions in the HQQ GitHub repository.") - - logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"") - - model_dir = Path(f'{shared.args.model_dir}/{model_name}') - model = AutoHQQHFModel.from_quantized(str(model_dir)) - HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) - return model - - def TensorRT_LLM_loader(model_name): try: from modules.tensorrt_llm import TensorRTLLMModel diff --git a/modules/models_settings.py b/modules/models_settings.py index 47dbc020..e742e0d8 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -183,8 +183,6 @@ def infer_loader(model_name, model_settings, hf_quant_method=None): loader = 'ExLlamav3_HF' elif re.match(r'.*exl2', model_name.lower()): loader = 'ExLlamav2_HF' - elif re.match(r'.*-hqq', model_name.lower()): - return 'HQQ' else: loader = 'Transformers' diff --git a/modules/shared.py b/modules/shared.py index a6c0cbe9..d2305f30 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -87,7 +87,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -152,10 +152,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') -# HQQ -group = parser.add_argument_group('HQQ') -group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') - # TensorRT-LLM group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') @@ -263,8 +259,6 @@ def fix_loader_name(name): return 'ExLlamav2_HF' elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']: return 'ExLlamav3_HF' - elif name in ['hqq']: - return 'HQQ' elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: return 'TensorRT-LLM' diff --git a/modules/ui.py b/modules/ui.py index 25f93612..f5dc0632 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -109,7 +109,6 @@ def list_model_elements(): 'threads', 'threads_batch', 'batch_size', - 'hqq_backend', 'ctx_size', 'cache_type', 'tensor_split', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 85cf4189..d361f692 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -42,8 +42,6 @@ def create_ui(): shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') - shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')