Remove ExLlamaV2 backend

- archived upstream: 7dc12af3a8
- replaced by ExLlamaV3, which has much better quantization accuracy
oobabooga 2026-03-05 13:57:21 -08:00
parent 134ac8fc29
commit 2f08dce7b0
19 changed files with 22 additions and 713 deletions

View file

@@ -5,10 +5,7 @@ from modules.logging_colors import logger
def add_lora_to_model(lora_names):
if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
add_lora_exllamav2(lora_names)
else:
add_lora_transformers(lora_names)
add_lora_transformers(lora_names)
def get_lora_path(lora_name):
@@ -19,32 +16,6 @@ def get_lora_path(lora_name):
return Path(f"{shared.args.lora_dir}/{lora_name}")
def add_lora_exllamav2(lora_names):
from exllamav2 import ExLlamaV2Lora
if isinstance(shared.model.loras, list):
for lora in shared.model.loras:
lora.unload()
if len(lora_names) > 0:
logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names)))
shared.model.loras = []
for lora_name in lora_names:
lora_path = get_lora_path(lora_name)
if shared.model.__class__.__name__ == 'Exllamav2Model':
lora = ExLlamaV2Lora.from_directory(shared.model.model, str(lora_path))
else:
lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))
shared.model.loras.append(lora)
shared.lora_names = lora_names
else:
shared.lora_names = []
shared.model.loras = None
def add_lora_transformers(lora_names):
from peft import PeftModel
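With the ExLlamaV2 branch gone, `add_lora_to_model` reduces to a single delegation. A minimal sketch of the resulting function, assuming the rest of the module is unchanged:

```python
# modules/LoRA.py after this commit: every loader now takes the
# PEFT/transformers path, so the class-name dispatch disappears.
def add_lora_to_model(lora_names):
    add_lora_transformers(lora_names)
```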

View file

@@ -46,10 +46,6 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
logger.error("Perplexity evaluation is not implemented for the llama.cpp loader.")
raise ValueError
if shared.args.loader == "ExLlamav2":
logger.error("ExLlamav2_HF is required for perplexity evaluation with EXL2 models. Please reload the model with ExLlamav2_HF instead of ExLlamav2.")
raise ValueError
if not shared.args.no_use_fast:
logger.warning("--no_use_fast is not set. If tokenizing the input dataset takes a long time, try reloading the model with that option set/checked.")

View file

@@ -1,247 +0,0 @@
import json
import traceback
from pathlib import Path
import torch
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Cache_Q4,
ExLlamaV2Cache_Q6,
ExLlamaV2Cache_Q8,
ExLlamaV2Cache_TP,
ExLlamaV2Config,
ExLlamaV2Tokenizer
)
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
try:
import flash_attn
except Exception:
logger.warning('Failed to load flash-attention due to the following error:\n')
traceback.print_exc()
class Exllamav2Model:
def __init__(self):
pass
@classmethod
def from_pretrained(self, path_to_model):
path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
config = ExLlamaV2Config()
config.model_dir = str(path_to_model)
config.prepare()
config.max_seq_len = shared.args.ctx_size
config.scale_pos_emb = shared.args.compress_pos_emb
config.scale_alpha_value = shared.args.alpha_value
config.no_flash_attn = shared.args.no_flash_attn
config.no_xformers = shared.args.no_xformers
config.no_sdpa = shared.args.no_sdpa
config.num_experts_per_token = int(shared.args.num_experts_per_token)
model = ExLlamaV2(config)
split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
if shared.args.enable_tp:
model.load_tp(split)
elif not shared.args.autosplit:
model.load(split)
# Determine the correct cache type
kv_cache_type = shared.args.cache_type.lower()
if kv_cache_type == 'fp16':
cache_type = ExLlamaV2Cache
elif kv_cache_type == 'fp8':
cache_type = ExLlamaV2Cache_8bit
elif kv_cache_type == 'q8':
cache_type = ExLlamaV2Cache_Q8
elif kv_cache_type == 'q6':
cache_type = ExLlamaV2Cache_Q6
elif kv_cache_type == 'q4':
cache_type = ExLlamaV2Cache_Q4
else:
raise ValueError(f"Invalid cache type for ExLlamaV2: {cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
# Use TP if specified
if shared.args.enable_tp:
cache = ExLlamaV2Cache_TP(model, base=cache_type)
else:
cache = cache_type(model, lazy=shared.args.autosplit)
if shared.args.autosplit and not shared.args.enable_tp:
model.load_autosplit(cache)
tokenizer = ExLlamaV2Tokenizer(config)
# Initialize draft model for speculative decoding
draft_model = None
draft_cache = None
if shared.args.model_draft and shared.args.model_draft.lower() not in ["none", ""]:
logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}")
# Find the draft model path
draft_path = Path(shared.args.model_draft)
if not draft_path.exists():
draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft)
draft_config = ExLlamaV2Config()
draft_config.model_dir = str(draft_path)
draft_config.prepare()
draft_config.arch_compat_overrides()
# Set context size for draft model
if shared.args.ctx_size_draft > 0:
draft_config.max_seq_len = shared.args.ctx_size_draft
else:
draft_config.max_seq_len = config.max_seq_len
draft_model = ExLlamaV2(draft_config)
draft_cache = cache_type(draft_model, lazy=True)
draft_model.load_autosplit(draft_cache)
logger.info(f"Draft model loaded successfully with max_draft={shared.args.draft_max}")
generator = ExLlamaV2StreamingGenerator(
model,
cache,
tokenizer,
draft_model=draft_model,
draft_cache=draft_cache,
num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0
)
result = self()
result.model = model
result.cache = cache
result.tokenizer = tokenizer
result.generator = generator
result.loras = None
result.draft_model = draft_model
result.draft_cache = draft_cache
return result, result
def encode(self, string, **kwargs):
add_bos = kwargs.pop('add_bos', True)
return self.tokenizer.encode(string, add_bos=add_bos, encode_special_tokens=True, **kwargs)
def decode(self, ids, **kwargs):
if isinstance(ids, list):
ids = torch.tensor([ids])
elif isinstance(ids, torch.Tensor) and ids.numel() == 1:
ids = ids.view(1, -1)
return self.tokenizer.decode(ids, decode_special_tokens=True)[0]
def get_logits(self, token_ids, **kwargs):
self.cache.current_seq_len = 0
if token_ids.shape[-1] > 1:
self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras)
return self.model.forward(token_ids[:, -1:], self.cache, input_mask=None, loras=self.loras, **kwargs).float().cpu()
def generate_with_streaming(self, prompt, state):
settings = ExLlamaV2Sampler.Settings()
settings.token_repetition_penalty = state['repetition_penalty']
settings.token_repetition_range = -1 if state['repetition_penalty_range'] <= 0 else state['repetition_penalty_range']
settings.token_frequency_penalty = state['frequency_penalty']
settings.token_presence_penalty = state['presence_penalty']
settings.temperature = state['temperature']
settings.smoothing_factor = state['smoothing_factor']
settings.min_temp = state['dynatemp_low'] if state['dynamic_temperature'] else 0
settings.max_temp = state['dynatemp_high'] if state['dynamic_temperature'] else 0
settings.temp_exponent = state['dynatemp_exponent']
settings.top_k = state['top_k']
settings.top_p = state['top_p']
settings.top_a = state['top_a']
settings.min_p = state['min_p']
settings.tfs = state['tfs']
settings.typical = state['typical_p']
settings.temperature_last = state['temperature_last']
settings.mirostat = state['mirostat_mode'] == 2
settings.mirostat_tau = state['mirostat_tau']
settings.mirostat_eta = state['mirostat_eta']
if state['ban_eos_token']:
settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
if state['custom_token_bans']:
to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
if len(to_ban) > 0:
settings.disallow_tokens(self.tokenizer, to_ban)
settings.dry_allowed_length = state['dry_allowed_length']
settings.dry_base = state['dry_base']
settings.dry_multiplier = state['dry_multiplier']
# Dry sequence breakers processing
if state['dry_multiplier'] > 0 and state['dry_sequence_breakers']:
dry_sequence_breakers = state['dry_sequence_breakers']
# Support both JSON array notation and comma-separated strings.
if not dry_sequence_breakers.startswith("["):
dry_sequence_breakers = "[" + dry_sequence_breakers + "]"
sequence_breaker_strings = json.loads(dry_sequence_breakers)
# Prefix with 'a' to get the correct encoding of the token at the end of a text.
sequence_breakers = {
self.encode(f"a{s}")[0, -1].item() for s in sequence_breaker_strings
}
settings.dry_sequence_breakers = sequence_breakers
settings.xtc_probability = state['xtc_probability']
settings.xtc_threshold = state['xtc_threshold']
ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
ids = ids[:, -get_max_prompt_length(state):]
if state['auto_max_new_tokens']:
max_new_tokens = state['truncation_length'] - ids.shape[-1]
else:
max_new_tokens = state['max_new_tokens']
# Reset speculative decoding stats if using a draft model
if hasattr(self, 'draft_model') and self.draft_model is not None:
self.generator.reset_sd_stats()
self.generator.begin_stream(ids, settings, loras=self.loras)
decoded_text = ''
for i in range(max_new_tokens):
chunk, eos, _ = self.generator.stream()
if eos or shared.stop_everything:
break
decoded_text += chunk
yield decoded_text
# Log speculative decoding stats if using draft model
if hasattr(self, 'draft_model') and self.draft_model is not None:
efficiency, accuracy, total_tokens, total_draft_tokens, accepted_draft_tokens = self.generator.get_sd_stats()
logger.info(f"Speculative decoding: accepted={accepted_draft_tokens}/{total_draft_tokens} tokens")
def generate(self, prompt, state):
output = ''
for output in self.generate_with_streaming(prompt, state):
pass
return output
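One detail in the removed generator is worth preserving: DRY sequence breakers are tokenized with a throwaway `a` prefix, because a tokenizer encodes a string differently at the start of text than mid-text, and only the final token is kept. A standalone sketch of that conversion, with `encode` as a stand-in for any HF/ExLlama-style tokenizer call returning a 2-D id tensor:

```python
import json

def parse_dry_sequence_breakers(raw: str, encode) -> set:
    # Accept both JSON array notation ('["\\n", ":"]') and plain
    # comma-separated input ('\\n, :'), as the removed code did.
    if not raw.startswith("["):
        raw = "[" + raw + "]"
    strings = json.loads(raw)
    # Prefix each breaker with 'a' so it is encoded as it would appear
    # at the end of running text, then keep only the trailing token id.
    return {encode(f"a{s}")[0, -1].item() for s in strings}
```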

View file

@@ -1,203 +0,0 @@
import os
import traceback
from pathlib import Path
from typing import Any, Dict, Optional, Union
import torch
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Cache_Q4,
ExLlamaV2Cache_Q6,
ExLlamaV2Cache_Q8,
ExLlamaV2Cache_TP,
ExLlamaV2Config
)
from torch.nn import CrossEntropyLoss
from transformers import (
GenerationConfig,
GenerationMixin,
PretrainedConfig,
PreTrainedModel
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from modules import shared
from modules.logging_colors import logger
try:
import flash_attn
except Exception:
logger.warning('Failed to load flash-attention due to the following error:\n')
traceback.print_exc()
class Exllamav2HF(PreTrainedModel, GenerationMixin):
def __init__(self, config: ExLlamaV2Config):
hf_config = PretrainedConfig.from_pretrained(config.model_dir)
super().__init__(hf_config)
self.ex_config = config
self.loras = None
self.generation_config = GenerationConfig()
self.ex_model = ExLlamaV2(config)
split = None
if shared.args.gpu_split:
split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
if shared.args.enable_tp:
self.ex_model.load_tp(split)
elif not shared.args.autosplit:
self.ex_model.load(split)
# Determine the correct cache type
kv_cache_type = shared.args.cache_type.lower()
if kv_cache_type == 'fp16':
cache_type = ExLlamaV2Cache
elif kv_cache_type == 'fp8':
cache_type = ExLlamaV2Cache_8bit
elif kv_cache_type == 'q8':
cache_type = ExLlamaV2Cache_Q8
elif kv_cache_type == 'q6':
cache_type = ExLlamaV2Cache_Q6
elif kv_cache_type == 'q4':
cache_type = ExLlamaV2Cache_Q4
else:
raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
# Use TP if specified
if shared.args.enable_tp:
self.ex_cache = ExLlamaV2Cache_TP(self.ex_model, base=cache_type)
else:
self.ex_cache = cache_type(self.ex_model, lazy=shared.args.autosplit)
if shared.args.autosplit and not shared.args.enable_tp:
self.ex_model.load_autosplit(self.ex_cache)
self.past_seq = None
if shared.args.cfg_cache:
if shared.args.enable_tp:
self.ex_cache_negative = ExLlamaV2Cache_TP(self.ex_model, base=cache_type)
else:
self.ex_cache_negative = cache_type(self.ex_model, lazy=shared.args.autosplit)
self.past_seq_negative = None
def _validate_model_class(self):
pass
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
pass
def prepare_inputs_for_generation(self, input_ids, **kwargs):
return {'input_ids': input_ids, **kwargs}
@property
def device(self) -> torch.device:
return torch.device(0)
def __call__(self, *args, **kwargs):
use_cache = kwargs.get('use_cache', True)
labels = kwargs.get('labels', None)
past_key_values = kwargs.get('past_key_values', None)
if len(args) > 0:
if not shared.args.cfg_cache:
logger.error("Please enable the cfg-cache option to use CFG with ExLlamav2_HF.")
return
input_ids = args[0]
is_negative = True
past_seq = self.past_seq_negative
ex_cache = self.ex_cache_negative
else:
input_ids = kwargs['input_ids']
is_negative = False
past_seq = self.past_seq
ex_cache = self.ex_cache
seq = input_ids[0].tolist()
if is_negative and past_key_values is not None:
seq = past_key_values + seq
seq_tensor = torch.tensor(seq)
reset = True
# Make the forward call
if labels is None:
if past_seq is not None:
min_length = min(past_seq.shape[0], seq_tensor.shape[0])
indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
if len(indices) > 0:
longest_prefix = indices[0].item()
else:
longest_prefix = min_length
if longest_prefix > 0:
reset = False
ex_cache.current_seq_len = longest_prefix
if len(seq_tensor) - longest_prefix > 1:
self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)
elif len(seq_tensor) == longest_prefix:
# Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
# because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
ex_cache.current_seq_len -= 1
if reset:
ex_cache.current_seq_len = 0
if len(seq_tensor) > 1:
self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)
logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, loras=self.loras).to(input_ids.device).float()
else:
ex_cache.current_seq_len = 0
logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, loras=self.loras).float()
if is_negative:
self.past_seq_negative = seq_tensor
else:
self.past_seq = seq_tensor
if torch.cuda.is_available():
torch.cuda.synchronize()
loss = None
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, logits.shape[-1])
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
if isinstance(pretrained_model_name_or_path, str):
pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
config = ExLlamaV2Config()
config.model_dir = str(pretrained_model_name_or_path)
config.prepare()
config.max_seq_len = shared.args.ctx_size
config.scale_pos_emb = shared.args.compress_pos_emb
config.scale_alpha_value = shared.args.alpha_value
config.no_flash_attn = shared.args.no_flash_attn
config.no_xformers = shared.args.no_xformers
config.no_sdpa = shared.args.no_sdpa
config.num_experts_per_token = int(shared.args.num_experts_per_token)
return Exllamav2HF(config)
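The subtlest part of the removed `__call__` is cache reuse: it computes the longest shared prefix between the previous and current token sequences so that only the new suffix is forwarded through the model (with the one-token backup noted in the "Very tricky" comment above). The prefix comparison itself is self-contained:

```python
import torch

def longest_shared_prefix(past_seq: torch.Tensor, seq: torch.Tensor) -> int:
    # Length of the common prefix of two 1-D token tensors, mirroring
    # the comparison in the removed Exllamav2HF.__call__.
    min_length = min(past_seq.shape[0], seq.shape[0])
    mismatches = torch.nonzero(~torch.eq(past_seq[:min_length], seq[:min_length]))
    return mismatches[0].item() if len(mismatches) > 0 else min_length

# Only tokens past the shared prefix need a new forward pass:
assert longest_shared_prefix(torch.tensor([1, 2, 3, 4]),
                             torch.tensor([1, 2, 3, 9, 10])) == 3
```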

View file

@@ -59,7 +59,7 @@ class Exllamav3Model:
logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}")
max_tokens = adjusted_tokens
# Parse cache type (ExLlamaV2 pattern)
# Parse cache type
cache_type = shared.args.cache_type.lower()
cache_kwargs = {}
if cache_type == 'fp16':

View file

@@ -74,38 +74,6 @@ loaders_and_params = OrderedDict({
'enable_tp',
'tp_backend',
],
'ExLlamav2_HF': [
'ctx_size',
'cache_type',
'gpu_split',
'alpha_value',
'compress_pos_emb',
'num_experts_per_token',
'autosplit',
'enable_tp',
'no_flash_attn',
'no_xformers',
'no_sdpa',
'cfg_cache',
'no_use_fast',
],
'ExLlamav2': [
'ctx_size',
'cache_type',
'gpu_split',
'alpha_value',
'compress_pos_emb',
'num_experts_per_token',
'autosplit',
'enable_tp',
'no_flash_attn',
'no_xformers',
'no_sdpa',
'model_draft',
'draft_max',
'ctx_size_draft',
'speculative_decoding_accordion',
],
'TensorRT-LLM': [
'ctx_size',
'cpp_runner',
@@ -222,56 +190,6 @@ loaders_samplers = {
'grammar_string',
'grammar_file_row',
},
'ExLlamav2_HF': {
'temperature',
'dynatemp_low',
'dynatemp_high',
'dynatemp_exponent',
'smoothing_factor',
'smoothing_curve',
'min_p',
'top_p',
'top_k',
'typical_p',
'xtc_threshold',
'xtc_probability',
'epsilon_cutoff',
'eta_cutoff',
'tfs',
'top_a',
'top_n_sigma',
'adaptive_target',
'adaptive_decay',
'dry_multiplier',
'dry_allowed_length',
'dry_base',
'repetition_penalty',
'frequency_penalty',
'presence_penalty',
'encoder_repetition_penalty',
'no_repeat_ngram_size',
'repetition_penalty_range',
'guidance_scale',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'do_sample',
'dynamic_temperature',
'temperature_last',
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'skip_special_tokens',
'seed',
'sampler_priority',
'custom_token_bans',
'negative_prompt',
'dry_sequence_breakers',
'grammar_string',
'grammar_file_row',
},
'ExLlamav3': {
'temperature',
'min_p',
@@ -292,41 +210,6 @@ loaders_samplers = {
'seed',
'skip_special_tokens',
},
'ExLlamav2': {
'temperature',
'dynatemp_low',
'dynatemp_high',
'dynatemp_exponent',
'smoothing_factor',
'min_p',
'top_p',
'top_k',
'typical_p',
'xtc_threshold',
'xtc_probability',
'tfs',
'top_a',
'dry_multiplier',
'dry_allowed_length',
'dry_base',
'repetition_penalty',
'frequency_penalty',
'presence_penalty',
'repetition_penalty_range',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'dynamic_temperature',
'temperature_last',
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'skip_special_tokens',
'custom_token_bans',
'dry_sequence_breakers',
},
'llama.cpp': {
'temperature',
'dynatemp_low',

View file

@@ -70,18 +70,12 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
from modules import sampler_hijack
from modules.torch_utils import get_device
is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
is_non_hf_exllamav3 = shared.model.__class__.__name__ == 'Exllamav3Model'
if not use_samplers:
state = {'stream': True}
if use_samplers:
if is_non_hf_exllamav2:
# sampling is all done in C++ for exllama, so it is really hard to hijack
logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
return 'Error: Sampler hijacking is not supported non-Huggingface loaders. Please disable the "Use samplers" option.', previous
state['max_new_tokens'] = 1
state['auto_max_new_tokens'] = False
state.setdefault('stream', True)
@@ -90,7 +84,7 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
scores = sampler_hijack.global_scores[-1]
else:
if is_non_hf_exllamav2 or is_non_hf_exllamav3:
if is_non_hf_exllamav3:
device = get_device()
tokens = shared.tokenizer.encode(prompt)
if device:

View file

@@ -20,8 +20,6 @@ def load_model(model_name, loader=None):
'Transformers': transformers_loader,
'ExLlamav3_HF': ExLlamav3_HF_loader,
'ExLlamav3': ExLlamav3_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'TensorRT-LLM': TensorRT_LLM_loader,
}
@@ -109,19 +107,6 @@ def ExLlamav3_loader(model_name):
return model, tokenizer
def ExLlamav2_HF_loader(model_name):
from modules.exllamav2_hf import Exllamav2HF
return Exllamav2HF.from_pretrained(model_name)
def ExLlamav2_loader(model_name):
from modules.exllamav2 import Exllamav2Model
model, tokenizer = Exllamav2Model.from_pretrained(model_name)
return model, tokenizer
def TensorRT_LLM_loader(model_name):
try:
from modules.tensorrt_llm import TensorRTLLMModel
@@ -141,8 +126,6 @@ def unload_model(keep_model_name=False):
if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
shared.model.unload()
elif model_class_name in ['Exllamav2Model', 'Exllamav2HF'] and hasattr(shared.model, 'unload'):
shared.model.unload()
shared.model = shared.tokenizer = None
shared.lora_names = []

View file

@@ -216,12 +216,8 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
loader = 'llama.cpp'
elif hf_quant_method == 'exl3':
loader = 'ExLlamav3'
elif hf_quant_method in ['exl2', 'gptq']:
loader = 'ExLlamav2_HF'
elif re.match(r'.*exl3', model_name.lower()):
loader = 'ExLlamav3'
elif re.match(r'.*exl2', model_name.lower()):
loader = 'ExLlamav2_HF'
else:
loader = 'Transformers'
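After this hunk, loader inference keeps a single quantization path: explicit `exl3` quant metadata or an `exl3` filename hint selects ExLlamav3, and everything else falls through to Transformers. A condensed sketch of the remaining priority order (the llama.cpp/GGUF branches above are elided):

```python
import re

def infer_loader_sketch(model_name: str, hf_quant_method: str | None) -> str:
    # Priority mirrors the post-commit infer_loader: quant metadata first,
    # then a filename hint, then the Transformers fallback.
    if hf_quant_method == 'exl3':
        return 'ExLlamav3'
    if re.match(r'.*exl3', model_name.lower()):
        return 'ExLlamav3'
    return 'Transformers'
```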
@@ -255,7 +251,7 @@ def apply_model_settings_to_state(model, state):
model_settings = get_model_metadata(model)
if 'loader' in model_settings:
loader = model_settings.pop('loader')
if not ((loader == 'ExLlamav2_HF' and state['loader'] == 'ExLlamav2') or (loader == 'ExLlamav3_HF' and state['loader'] == 'ExLlamav3')):
if not (loader == 'ExLlamav3_HF' and state['loader'] == 'ExLlamav3'):
state['loader'] = loader
for k in model_settings:

View file

@@ -67,12 +67,12 @@ group.add_argument('--image-quant', type=str, default=None,
# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav3, TensorRT-LLM.')
# Cache
group = parser.add_argument_group('Context and cache')
group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens. llama.cpp: 0 = auto if gpu-layers is also -1.')
group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
# Speculative decoding
group = parser.add_argument_group('Speculative decoding')
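The updated `--cache-type` help above mentions combined ExLlamaV3 specs such as `q4_q8` (k-bits, then v-bits). A hypothetical parser for that format; the function name and the `k_bits`/`v_bits` kwargs are illustrative, not the project's actual API:

```python
def parse_exllamav3_cache_type(spec: str) -> dict:
    # 'fp16' -> no quantization kwargs; 'q4' -> k=v=4; 'q4_q8' -> k=4, v=8.
    spec = spec.lower()
    if spec == 'fp16':
        return {}
    parts = spec.split('_')
    if not all(p.startswith('q') and p[1:].isdigit() for p in parts):
        raise ValueError(f"Invalid ExLlamaV3 cache type: {spec}")
    return {'k_bits': int(parts[0][1:]), 'v_bits': int(parts[-1][1:])}

assert parse_exllamav3_cache_type('q4_q8') == {'k_bits': 4, 'v_bits': 8}
```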
@@ -127,18 +127,10 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
# ExLlamaV3
group = parser.add_argument_group('ExLlamaV3')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) to split the model across GPUs.')
group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.')
# ExLlamaV2
group = parser.add_argument_group('ExLlamaV2')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--autosplit', action='store_true', help='Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.')
group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
# TensorRT-LLM
group = parser.add_argument_group('TensorRT-LLM')
@@ -379,10 +371,6 @@ def fix_loader_name(name):
return 'llama.cpp'
elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
return 'Transformers'
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
return 'ExLlamav2'
elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
return 'ExLlamav2_HF'
elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']:
return 'ExLlamav3_HF'
elif name in ['exllamav3']:

View file

@@ -40,7 +40,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']:
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav3Model', 'TensorRTLLMModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
@@ -128,9 +128,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
from modules.torch_utils import get_device
if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']:
if shared.model.__class__.__name__ in ['Exllamav3Model', 'TensorRTLLMModel']:
input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ not in ['Exllamav2Model', 'Exllamav3Model']:
if shared.model.__class__.__name__ not in ['Exllamav3Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
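The encode dispatch above now distinguishes just two families: loaders whose tokenizers take raw strings (ExLlamaV3, TensorRT-LLM) and the Hugging Face path. A condensed sketch of the resulting control flow, using this commit's class list:

```python
import numpy as np

def encode_sketch(prompt, tokenizer, model_class_name: str):
    # Raw-string tokenizers bypass the HF path; after this commit only
    # TensorRT-LLM still needs the numpy reshape to a batch of one.
    if model_class_name in ('Exllamav3Model', 'TensorRTLLMModel'):
        input_ids = tokenizer.encode(str(prompt))
        if model_class_name == 'TensorRTLLMModel':
            input_ids = np.array(input_ids).reshape(1, len(input_ids))
        return input_ids
    return tokenizer.encode(str(prompt), return_tensors='pt')
```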
@@ -148,7 +148,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu:
return input_ids
else:
device = get_device()

View file

@@ -141,7 +141,6 @@ def list_model_elements():
'compress_pos_emb',
'compute_dtype',
'quant_type',
'num_experts_per_token',
'load_in_8bit',
'load_in_4bit',
'attn_implementation',
@@ -154,12 +153,8 @@ def list_model_elements():
'numa',
'use_double_quant',
'bf16',
'autosplit',
'enable_tp',
'tp_backend',
'no_flash_attn',
'no_xformers',
'no_sdpa',
'cfg_cache',
'cpp_runner',
'no_use_fast',

View file

@@ -45,7 +45,7 @@ def create_ui():
shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. llama.cpp: 0 = auto if gpu-layers is also -1. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['tp_backend'] = gr.Dropdown(label="tp-backend", choices=['native', 'nccl'], value=shared.args.tp_backend, info='The backend for tensor parallelism.')
with gr.Column():
@@ -55,7 +55,6 @@ def create_ui():
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
@@ -101,7 +100,6 @@ def create_ui():
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
with gr.Column():
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
@@ -112,9 +110,6 @@ def create_ui():
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
if not shared.args.portable: