Remove ExLlamaV2 backend

- archived upstream: 7dc12af3a8
- replaced by ExLlamaV3, which has much better quantization accuracy
oobabooga 2026-03-05 13:57:21 -08:00
parent 134ac8fc29
commit 2f08dce7b0
19 changed files with 22 additions and 713 deletions


@@ -20,8 +20,6 @@ def load_model(model_name, loader=None):
         'Transformers': transformers_loader,
         'ExLlamav3_HF': ExLlamav3_HF_loader,
         'ExLlamav3': ExLlamav3_loader,
-        'ExLlamav2_HF': ExLlamav2_HF_loader,
-        'ExLlamav2': ExLlamav2_loader,
         'TensorRT-LLM': TensorRT_LLM_loader,
     }
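
The hunk above trims the loader map down to the remaining backends. As a minimal, self-contained sketch of the dispatch pattern (the stub loader and the error handling are illustrative assumptions, not the project's actual code):

# Minimal sketch of the loader dispatch implied by the hunk above. The stub
# loader and the ValueError are assumptions; the real loaders live in
# modules/ and return (model, tokenizer).
def _stub_loader(model_name):
    # Stand-in for ExLlamav3_loader and friends.
    return object(), object()


def load_model(model_name, loader=None):
    loaders = {
        'Transformers': _stub_loader,
        'ExLlamav3_HF': _stub_loader,
        'ExLlamav3': _stub_loader,
        'TensorRT-LLM': _stub_loader,
    }
    if loader not in loaders:
        raise ValueError(f"Unknown loader: {loader!r}")

    return loaders[loader](model_name)


model, tokenizer = load_model('my-model', loader='ExLlamav3')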
@@ -109,19 +107,6 @@ def ExLlamav3_loader(model_name):
     return model, tokenizer
-def ExLlamav2_HF_loader(model_name):
-    from modules.exllamav2_hf import Exllamav2HF
-    return Exllamav2HF.from_pretrained(model_name)
-def ExLlamav2_loader(model_name):
-    from modules.exllamav2 import Exllamav2Model
-    model, tokenizer = Exllamav2Model.from_pretrained(model_name)
-    return model, tokenizer
 def TensorRT_LLM_loader(model_name):
     try:
         from modules.tensorrt_llm import TensorRTLLMModel
@@ -141,8 +126,6 @@ def unload_model(keep_model_name=False):
     if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
         shared.model.unload()
-    elif model_class_name in ['Exllamav2Model', 'Exllamav2HF'] and hasattr(shared.model, 'unload'):
-        shared.model.unload()
     shared.model = shared.tokenizer = None
     shared.lora_names = []
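
With the ExLlamaV2 branch gone, only the ExLlamaV3 backends keep an explicit unload() hook in unload_model. A rough, runnable sketch of that teardown (the shared container and the model class are stand-ins so the snippet runs on its own, not the project's real modules):

# Rough sketch of the simplified teardown after this commit.
class Exllamav3Model:
    # Same class name as the real backend so the branch below fires.
    def unload(self):
        print("freeing ExLlamaV3 weights")


class shared:
    model = Exllamav3Model()
    tokenizer = object()
    lora_names = ['example-lora']


def unload_model(keep_model_name=False):
    model_class_name = shared.model.__class__.__name__
    # Only the ExLlamaV3 backends are checked now.
    if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
        shared.model.unload()

    shared.model = shared.tokenizer = None
    shared.lora_names = []


unload_model()
assert shared.model is None and shared.tokenizer is None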