Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2026-04-06 07:03:37 +00:00
Remove ExLlamaV2 backend
- archived upstream: 7dc12af3a8
- replaced by ExLlamaV3, which has much better quantization accuracy
This commit is contained in:
parent 134ac8fc29
commit 2f08dce7b0

19 changed files with 22 additions and 713 deletions
@@ -5,10 +5,7 @@ from modules.logging_colors import logger


def add_lora_to_model(lora_names):
    if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
        add_lora_exllamav2(lora_names)
    else:
        add_lora_transformers(lora_names)
    add_lora_transformers(lora_names)


def get_lora_path(lora_name):
@@ -19,32 +16,6 @@ def get_lora_path(lora_name):
    return Path(f"{shared.args.lora_dir}/{lora_name}")


def add_lora_exllamav2(lora_names):

    from exllamav2 import ExLlamaV2Lora

    if isinstance(shared.model.loras, list):
        for lora in shared.model.loras:
            lora.unload()

    if len(lora_names) > 0:
        logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names)))
        shared.model.loras = []
        for lora_name in lora_names:
            lora_path = get_lora_path(lora_name)
            if shared.model.__class__.__name__ == 'Exllamav2Model':
                lora = ExLlamaV2Lora.from_directory(shared.model.model, str(lora_path))
            else:
                lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))

            shared.model.loras.append(lora)

        shared.lora_names = lora_names
    else:
        shared.lora_names = []
        shared.model.loras = None


def add_lora_transformers(lora_names):
    from peft import PeftModel
@@ -46,10 +46,6 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
            logger.error("Perplexity evaluation is not implemented for the llama.cpp loader.")
            raise ValueError

        if shared.args.loader == "ExLlamav2":
            logger.error("ExLlamav2_HF is required for perplexity evaluation with EXL2 models. Please reload the model with ExLlamav2_HF instead of ExLlamav2.")
            raise ValueError

        if not shared.args.no_use_fast:
            logger.warning("--no_use_fast is not set. If tokenizing the input dataset takes a long time, try reloading the model with that option set/checked.")
@@ -1,247 +0,0 @@
import json
import traceback
from pathlib import Path

import torch
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
    ExLlamaV2Cache_TP,
    ExLlamaV2Config,
    ExLlamaV2Tokenizer
)
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator

from modules import shared
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length

try:
    import flash_attn
except Exception:
    logger.warning('Failed to load flash-attention due to the following error:\n')
    traceback.print_exc()


class Exllamav2Model:
    def __init__(self):
        pass

    @classmethod
    def from_pretrained(self, path_to_model):

        path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)

        config = ExLlamaV2Config()
        config.model_dir = str(path_to_model)
        config.prepare()

        config.max_seq_len = shared.args.ctx_size
        config.scale_pos_emb = shared.args.compress_pos_emb
        config.scale_alpha_value = shared.args.alpha_value
        config.no_flash_attn = shared.args.no_flash_attn
        config.no_xformers = shared.args.no_xformers
        config.no_sdpa = shared.args.no_sdpa
        config.num_experts_per_token = int(shared.args.num_experts_per_token)

        model = ExLlamaV2(config)

        split = None
        if shared.args.gpu_split:
            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

        if shared.args.enable_tp:
            model.load_tp(split)
        elif not shared.args.autosplit:
            model.load(split)

        # Determine the correct cache type
        kv_cache_type = shared.args.cache_type.lower()

        if kv_cache_type == 'fp16':
            cache_type = ExLlamaV2Cache
        elif kv_cache_type == 'fp8':
            cache_type = ExLlamaV2Cache_8bit
        elif kv_cache_type == 'q8':
            cache_type = ExLlamaV2Cache_Q8
        elif kv_cache_type == 'q6':
            cache_type = ExLlamaV2Cache_Q6
        elif kv_cache_type == 'q4':
            cache_type = ExLlamaV2Cache_Q4
        else:
            raise ValueError(f"Invalid cache type for ExLlamaV2: {cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")

        # Use TP if specified
        if shared.args.enable_tp:
            cache = ExLlamaV2Cache_TP(model, base=cache_type)
        else:
            cache = cache_type(model, lazy=shared.args.autosplit)

        if shared.args.autosplit and not shared.args.enable_tp:
            model.load_autosplit(cache)

        tokenizer = ExLlamaV2Tokenizer(config)

        # Initialize draft model for speculative decoding
        draft_model = None
        draft_cache = None

        if shared.args.model_draft and shared.args.model_draft.lower() not in ["none", ""]:
            logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}")

            # Find the draft model path
            draft_path = Path(shared.args.model_draft)
            if not draft_path.exists():
                draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft)

            draft_config = ExLlamaV2Config()
            draft_config.model_dir = str(draft_path)
            draft_config.prepare()
            draft_config.arch_compat_overrides()

            # Set context size for draft model
            if shared.args.ctx_size_draft > 0:
                draft_config.max_seq_len = shared.args.ctx_size_draft
            else:
                draft_config.max_seq_len = config.max_seq_len

            draft_model = ExLlamaV2(draft_config)
            draft_cache = cache_type(draft_model, lazy=True)
            draft_model.load_autosplit(draft_cache)

            logger.info(f"Draft model loaded successfully with max_draft={shared.args.draft_max}")

        generator = ExLlamaV2StreamingGenerator(
            model,
            cache,
            tokenizer,
            draft_model=draft_model,
            draft_cache=draft_cache,
            num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0
        )

        result = self()
        result.model = model
        result.cache = cache
        result.tokenizer = tokenizer
        result.generator = generator
        result.loras = None
        result.draft_model = draft_model
        result.draft_cache = draft_cache
        return result, result

    def encode(self, string, **kwargs):
        add_bos = kwargs.pop('add_bos', True)
        return self.tokenizer.encode(string, add_bos=add_bos, encode_special_tokens=True, **kwargs)

    def decode(self, ids, **kwargs):
        if isinstance(ids, list):
            ids = torch.tensor([ids])
        elif isinstance(ids, torch.Tensor) and ids.numel() == 1:
            ids = ids.view(1, -1)

        return self.tokenizer.decode(ids, decode_special_tokens=True)[0]

    def get_logits(self, token_ids, **kwargs):
        self.cache.current_seq_len = 0
        if token_ids.shape[-1] > 1:
            self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras)

        return self.model.forward(token_ids[:, -1:], self.cache, input_mask=None, loras=self.loras, **kwargs).float().cpu()

    def generate_with_streaming(self, prompt, state):
        settings = ExLlamaV2Sampler.Settings()

        settings.token_repetition_penalty = state['repetition_penalty']
        settings.token_repetition_range = -1 if state['repetition_penalty_range'] <= 0 else state['repetition_penalty_range']

        settings.token_frequency_penalty = state['frequency_penalty']
        settings.token_presence_penalty = state['presence_penalty']

        settings.temperature = state['temperature']
        settings.smoothing_factor = state['smoothing_factor']
        settings.min_temp = state['dynatemp_low'] if state['dynamic_temperature'] else 0
        settings.max_temp = state['dynatemp_high'] if state['dynamic_temperature'] else 0
        settings.temp_exponent = state['dynatemp_exponent']
        settings.top_k = state['top_k']
        settings.top_p = state['top_p']
        settings.top_a = state['top_a']
        settings.min_p = state['min_p']
        settings.tfs = state['tfs']
        settings.typical = state['typical_p']

        settings.temperature_last = state['temperature_last']

        settings.mirostat = state['mirostat_mode'] == 2
        settings.mirostat_tau = state['mirostat_tau']
        settings.mirostat_eta = state['mirostat_eta']

        if state['ban_eos_token']:
            settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])

        if state['custom_token_bans']:
            to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
            if len(to_ban) > 0:
                settings.disallow_tokens(self.tokenizer, to_ban)

        settings.dry_allowed_length = state['dry_allowed_length']
        settings.dry_base = state['dry_base']
        settings.dry_multiplier = state['dry_multiplier']

        # Dry sequence breakers processing
        if state['dry_multiplier'] > 0 and state['dry_sequence_breakers']:
            dry_sequence_breakers = state['dry_sequence_breakers']

            # Support both JSON array notation and comma-separated strings.
            if not dry_sequence_breakers.startswith("["):
                dry_sequence_breakers = "[" + dry_sequence_breakers + "]"

            sequence_breaker_strings = json.loads(dry_sequence_breakers)
            # Prefix with 'a' to get the correct encoding of the token at the end of a text.
            sequence_breakers = {
                self.encode(f"a{s}")[0, -1].item() for s in sequence_breaker_strings
            }

            settings.dry_sequence_breakers = sequence_breakers

        settings.xtc_probability = state['xtc_probability']
        settings.xtc_threshold = state['xtc_threshold']

        ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
        ids = ids[:, -get_max_prompt_length(state):]

        if state['auto_max_new_tokens']:
            max_new_tokens = state['truncation_length'] - ids.shape[-1]
        else:
            max_new_tokens = state['max_new_tokens']

        # Reset speculative decoding stats if using a draft model
        if hasattr(self, 'draft_model') and self.draft_model is not None:
            self.generator.reset_sd_stats()

        self.generator.begin_stream(ids, settings, loras=self.loras)

        decoded_text = ''
        for i in range(max_new_tokens):
            chunk, eos, _ = self.generator.stream()
            if eos or shared.stop_everything:
                break

            decoded_text += chunk
            yield decoded_text

        # Log speculative decoding stats if using draft model
        if hasattr(self, 'draft_model') and self.draft_model is not None:
            efficiency, accuracy, total_tokens, total_draft_tokens, accepted_draft_tokens = self.generator.get_sd_stats()
            logger.info(f"Speculative decoding: accepted={accepted_draft_tokens}/{total_draft_tokens} tokens")

    def generate(self, prompt, state):
        output = ''
        for output in self.generate_with_streaming(prompt, state):
            pass

        return output
@@ -1,203 +0,0 @@
import os
import traceback
from pathlib import Path
from typing import Any, Dict, Optional, Union

import torch
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
    ExLlamaV2Cache_TP,
    ExLlamaV2Config
)
from torch.nn import CrossEntropyLoss
from transformers import (
    GenerationConfig,
    GenerationMixin,
    PretrainedConfig,
    PreTrainedModel
)
from transformers.modeling_outputs import CausalLMOutputWithPast

from modules import shared
from modules.logging_colors import logger

try:
    import flash_attn
except Exception:
    logger.warning('Failed to load flash-attention due to the following error:\n')
    traceback.print_exc()


class Exllamav2HF(PreTrainedModel, GenerationMixin):
    def __init__(self, config: ExLlamaV2Config):
        hf_config = PretrainedConfig.from_pretrained(config.model_dir)
        super().__init__(hf_config)

        self.ex_config = config
        self.loras = None
        self.generation_config = GenerationConfig()

        self.ex_model = ExLlamaV2(config)

        split = None
        if shared.args.gpu_split:
            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

        if shared.args.enable_tp:
            self.ex_model.load_tp(split)
        elif not shared.args.autosplit:
            self.ex_model.load(split)

        # Determine the correct cache type
        kv_cache_type = shared.args.cache_type.lower()

        if kv_cache_type == 'fp16':
            cache_type = ExLlamaV2Cache
        elif kv_cache_type == 'fp8':
            cache_type = ExLlamaV2Cache_8bit
        elif kv_cache_type == 'q8':
            cache_type = ExLlamaV2Cache_Q8
        elif kv_cache_type == 'q6':
            cache_type = ExLlamaV2Cache_Q6
        elif kv_cache_type == 'q4':
            cache_type = ExLlamaV2Cache_Q4
        else:
            raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")

        # Use TP if specified
        if shared.args.enable_tp:
            self.ex_cache = ExLlamaV2Cache_TP(self.ex_model, base=cache_type)
        else:
            self.ex_cache = cache_type(self.ex_model, lazy=shared.args.autosplit)

        if shared.args.autosplit and not shared.args.enable_tp:
            self.ex_model.load_autosplit(self.ex_cache)

        self.past_seq = None
        if shared.args.cfg_cache:
            if shared.args.enable_tp:
                self.ex_cache_negative = ExLlamaV2Cache_TP(self.ex_model, base=cache_type)
            else:
                self.ex_cache_negative = cache_type(self.ex_model, lazy=shared.args.autosplit)

            self.past_seq_negative = None

    def _validate_model_class(self):
        pass

    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        pass

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {'input_ids': input_ids, **kwargs}

    @property
    def device(self) -> torch.device:
        return torch.device(0)

    def __call__(self, *args, **kwargs):
        use_cache = kwargs.get('use_cache', True)
        labels = kwargs.get('labels', None)
        past_key_values = kwargs.get('past_key_values', None)

        if len(args) > 0:
            if not shared.args.cfg_cache:
                logger.error("Please enable the cfg-cache option to use CFG with ExLlamav2_HF.")
                return

            input_ids = args[0]
            is_negative = True
            past_seq = self.past_seq_negative
            ex_cache = self.ex_cache_negative
        else:
            input_ids = kwargs['input_ids']
            is_negative = False
            past_seq = self.past_seq
            ex_cache = self.ex_cache

        seq = input_ids[0].tolist()
        if is_negative and past_key_values is not None:
            seq = past_key_values + seq

        seq_tensor = torch.tensor(seq)
        reset = True

        # Make the forward call
        if labels is None:
            if past_seq is not None:
                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
                if len(indices) > 0:
                    longest_prefix = indices[0].item()
                else:
                    longest_prefix = min_length

                if longest_prefix > 0:
                    reset = False
                    ex_cache.current_seq_len = longest_prefix
                    if len(seq_tensor) - longest_prefix > 1:
                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)
                    elif len(seq_tensor) == longest_prefix:
                        # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
                        # because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
                        ex_cache.current_seq_len -= 1

            if reset:
                ex_cache.current_seq_len = 0
                if len(seq_tensor) > 1:
                    self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)

            logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, loras=self.loras).to(input_ids.device).float()
        else:
            ex_cache.current_seq_len = 0
            logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, loras=self.loras).float()

        if is_negative:
            self.past_seq_negative = seq_tensor
        else:
            self.past_seq = seq_tensor

        if torch.cuda.is_available():
            torch.cuda.synchronize()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, logits.shape[-1])
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
        assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
        if isinstance(pretrained_model_name_or_path, str):
            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)

        pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)

        config = ExLlamaV2Config()
        config.model_dir = str(pretrained_model_name_or_path)
        config.prepare()

        config.max_seq_len = shared.args.ctx_size
        config.scale_pos_emb = shared.args.compress_pos_emb
        config.scale_alpha_value = shared.args.alpha_value
        config.no_flash_attn = shared.args.no_flash_attn
        config.no_xformers = shared.args.no_xformers
        config.no_sdpa = shared.args.no_sdpa
        config.num_experts_per_token = int(shared.args.num_experts_per_token)

        return Exllamav2HF(config)
@@ -59,7 +59,7 @@ class Exllamav3Model:
            logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}")
            max_tokens = adjusted_tokens

        # Parse cache type (ExLlamaV2 pattern)
        # Parse cache type
        cache_type = shared.args.cache_type.lower()
        cache_kwargs = {}
        if cache_type == 'fp16':
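The ExLlamaV3 cache-type strings referenced above allow separate key and value bit widths (for example q4_q8, per the --cache-type help text later in this diff). As a hedged illustration only, not code from this commit, parsing such a string into its two bit widths could look like:

def parse_cache_bits(cache_type):
    # Hypothetical helper: split an ExLlamaV3 cache-type string such as
    # "q4_q8" into separate k/v bit widths; a single value like "q6" applies to both.
    if cache_type == 'fp16':
        return None, None  # full-precision cache, nothing to quantize
    parts = cache_type.split('_')  # "q4_q8" -> ["q4", "q8"]; "q6" -> ["q6"]
    return int(parts[0].lstrip('q')), int(parts[-1].lstrip('q'))

print(parse_cache_bits('q4_q8'))  # (4, 8)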
@@ -74,38 +74,6 @@ loaders_and_params = OrderedDict({
        'enable_tp',
        'tp_backend',
    ],
    'ExLlamav2_HF': [
        'ctx_size',
        'cache_type',
        'gpu_split',
        'alpha_value',
        'compress_pos_emb',
        'num_experts_per_token',
        'autosplit',
        'enable_tp',
        'no_flash_attn',
        'no_xformers',
        'no_sdpa',
        'cfg_cache',
        'no_use_fast',
    ],
    'ExLlamav2': [
        'ctx_size',
        'cache_type',
        'gpu_split',
        'alpha_value',
        'compress_pos_emb',
        'num_experts_per_token',
        'autosplit',
        'enable_tp',
        'no_flash_attn',
        'no_xformers',
        'no_sdpa',
        'model_draft',
        'draft_max',
        'ctx_size_draft',
        'speculative_decoding_accordion',
    ],
    'TensorRT-LLM': [
        'ctx_size',
        'cpp_runner',
@@ -222,56 +190,6 @@ loaders_samplers = {
        'grammar_string',
        'grammar_file_row',
    },
    'ExLlamav2_HF': {
        'temperature',
        'dynatemp_low',
        'dynatemp_high',
        'dynatemp_exponent',
        'smoothing_factor',
        'smoothing_curve',
        'min_p',
        'top_p',
        'top_k',
        'typical_p',
        'xtc_threshold',
        'xtc_probability',
        'epsilon_cutoff',
        'eta_cutoff',
        'tfs',
        'top_a',
        'top_n_sigma',
        'adaptive_target',
        'adaptive_decay',
        'dry_multiplier',
        'dry_allowed_length',
        'dry_base',
        'repetition_penalty',
        'frequency_penalty',
        'presence_penalty',
        'encoder_repetition_penalty',
        'no_repeat_ngram_size',
        'repetition_penalty_range',
        'guidance_scale',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
        'do_sample',
        'dynamic_temperature',
        'temperature_last',
        'auto_max_new_tokens',
        'ban_eos_token',
        'add_bos_token',
        'enable_thinking',
        'reasoning_effort',
        'skip_special_tokens',
        'seed',
        'sampler_priority',
        'custom_token_bans',
        'negative_prompt',
        'dry_sequence_breakers',
        'grammar_string',
        'grammar_file_row',
    },
    'ExLlamav3': {
        'temperature',
        'min_p',
@@ -292,41 +210,6 @@ loaders_samplers = {
        'seed',
        'skip_special_tokens',
    },
    'ExLlamav2': {
        'temperature',
        'dynatemp_low',
        'dynatemp_high',
        'dynatemp_exponent',
        'smoothing_factor',
        'min_p',
        'top_p',
        'top_k',
        'typical_p',
        'xtc_threshold',
        'xtc_probability',
        'tfs',
        'top_a',
        'dry_multiplier',
        'dry_allowed_length',
        'dry_base',
        'repetition_penalty',
        'frequency_penalty',
        'presence_penalty',
        'repetition_penalty_range',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
        'dynamic_temperature',
        'temperature_last',
        'auto_max_new_tokens',
        'ban_eos_token',
        'add_bos_token',
        'enable_thinking',
        'reasoning_effort',
        'skip_special_tokens',
        'custom_token_bans',
        'dry_sequence_breakers',
    },
    'llama.cpp': {
        'temperature',
        'dynatemp_low',
@@ -70,18 +70,12 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
    from modules import sampler_hijack
    from modules.torch_utils import get_device

    is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
    is_non_hf_exllamav3 = shared.model.__class__.__name__ == 'Exllamav3Model'

    if not use_samplers:
        state = {'stream': True}

    if use_samplers:
        if is_non_hf_exllamav2:
            # sampling is all done in C++ for exllama, so it is really hard to hijack
            logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
            return 'Error: Sampler hijacking is not supported non-Huggingface loaders. Please disable the "Use samplers" option.', previous

        state['max_new_tokens'] = 1
        state['auto_max_new_tokens'] = False
        state.setdefault('stream', True)
@@ -90,7 +84,7 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur

        scores = sampler_hijack.global_scores[-1]
    else:
        if is_non_hf_exllamav2 or is_non_hf_exllamav3:
        if is_non_hf_exllamav3:
            device = get_device()
            tokens = shared.tokenizer.encode(prompt)
            if device:
@@ -20,8 +20,6 @@ def load_model(model_name, loader=None):
        'Transformers': transformers_loader,
        'ExLlamav3_HF': ExLlamav3_HF_loader,
        'ExLlamav3': ExLlamav3_loader,
        'ExLlamav2_HF': ExLlamav2_HF_loader,
        'ExLlamav2': ExLlamav2_loader,
        'TensorRT-LLM': TensorRT_LLM_loader,
    }
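The load_model() entry point shown above keeps the same signature for the remaining loaders. A minimal, hedged usage sketch follows; the model folder name and the modules.models import path are assumptions for illustration, not part of this diff:

from modules import shared
from modules.models import load_model  # assumed location of load_model()

# 'MyModel-exl3' stands in for any EXL3 quant folder under shared.args.model_dir.
shared.model, shared.tokenizer = load_model('MyModel-exl3', loader='ExLlamav3_HF')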
@@ -109,19 +107,6 @@ def ExLlamav3_loader(model_name):
    return model, tokenizer


def ExLlamav2_HF_loader(model_name):
    from modules.exllamav2_hf import Exllamav2HF

    return Exllamav2HF.from_pretrained(model_name)


def ExLlamav2_loader(model_name):
    from modules.exllamav2 import Exllamav2Model

    model, tokenizer = Exllamav2Model.from_pretrained(model_name)
    return model, tokenizer


def TensorRT_LLM_loader(model_name):
    try:
        from modules.tensorrt_llm import TensorRTLLMModel
@@ -141,8 +126,6 @@ def unload_model(keep_model_name=False):

    if model_class_name in ['Exllamav3Model', 'Exllamav3HF']:
        shared.model.unload()
    elif model_class_name in ['Exllamav2Model', 'Exllamav2HF'] and hasattr(shared.model, 'unload'):
        shared.model.unload()

    shared.model = shared.tokenizer = None
    shared.lora_names = []
@@ -216,12 +216,8 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
        loader = 'llama.cpp'
    elif hf_quant_method == 'exl3':
        loader = 'ExLlamav3'
    elif hf_quant_method in ['exl2', 'gptq']:
        loader = 'ExLlamav2_HF'
    elif re.match(r'.*exl3', model_name.lower()):
        loader = 'ExLlamav3'
    elif re.match(r'.*exl2', model_name.lower()):
        loader = 'ExLlamav2_HF'
    else:
        loader = 'Transformers'
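With the exl2 and gptq branches removed above, autodetection now falls through to Transformers for anything not recognized as llama.cpp or exl3. A standalone hedged sketch of just that remaining filename fallback (not the project's code):

import re

def guess_loader(model_name):
    # Mirrors the surviving filename check: exl3 names map to ExLlamav3,
    # everything else defaults to Transformers.
    if re.match(r'.*exl3', model_name.lower()):
        return 'ExLlamav3'
    return 'Transformers'

print(guess_loader('Llama-3.1-8B-exl3'))  # ExLlamav3
print(guess_loader('Llama-3.1-8B-exl2'))  # Transformers (EXL2 detection removed)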
@@ -255,7 +251,7 @@ def apply_model_settings_to_state(model, state):
    model_settings = get_model_metadata(model)
    if 'loader' in model_settings:
        loader = model_settings.pop('loader')
        if not ((loader == 'ExLlamav2_HF' and state['loader'] == 'ExLlamav2') or (loader == 'ExLlamav3_HF' and state['loader'] == 'ExLlamav3')):
        if not (loader == 'ExLlamav3_HF' and state['loader'] == 'ExLlamav3'):
            state['loader'] = loader

    for k in model_settings:
@@ -67,12 +67,12 @@ group.add_argument('--image-quant', type=str, default=None,

# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav3, TensorRT-LLM.')

# Cache
group = parser.add_argument_group('Context and cache')
group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens. llama.cpp: 0 = auto if gpu-layers is also -1.')
group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')

# Speculative decoding
group = parser.add_argument_group('Speculative decoding')
@@ -127,18 +127,10 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for

# ExLlamaV3
group = parser.add_argument_group('ExLlamaV3')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) to split the model across GPUs.')
group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.')

# ExLlamaV2
group = parser.add_argument_group('ExLlamaV2')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--autosplit', action='store_true', help='Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.')
group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')

# TensorRT-LLM
group = parser.add_argument_group('TensorRT-LLM')
@@ -379,10 +371,6 @@ def fix_loader_name(name):
        return 'llama.cpp'
    elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
        return 'Transformers'
    elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
        return 'ExLlamav2'
    elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
        return 'ExLlamav2_HF'
    elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']:
        return 'ExLlamav3_HF'
    elif name in ['exllamav3']:
@@ -40,7 +40,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
        yield ''
        return

    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']:
    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav3Model', 'TensorRTLLMModel']:
        generate_func = generate_reply_custom
    else:
        generate_func = generate_reply_HF
@@ -128,9 +128,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt

    from modules.torch_utils import get_device

    if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']:
    if shared.model.__class__.__name__ in ['Exllamav3Model', 'TensorRTLLMModel']:
        input_ids = shared.tokenizer.encode(str(prompt))
        if shared.model.__class__.__name__ not in ['Exllamav2Model', 'Exllamav3Model']:
        if shared.model.__class__.__name__ not in ['Exllamav3Model']:
            input_ids = np.array(input_ids).reshape(1, len(input_ids))
    else:
        input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
@@ -148,7 +148,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if truncation_length is not None:
        input_ids = input_ids[:, -truncation_length:]

    if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu:
    if shared.model.__class__.__name__ in ['Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu:
        return input_ids
    else:
        device = get_device()
@@ -141,7 +141,6 @@ def list_model_elements():
        'compress_pos_emb',
        'compute_dtype',
        'quant_type',
        'num_experts_per_token',
        'load_in_8bit',
        'load_in_4bit',
        'attn_implementation',
@@ -154,12 +153,8 @@ def list_model_elements():
        'numa',
        'use_double_quant',
        'bf16',
        'autosplit',
        'enable_tp',
        'tp_backend',
        'no_flash_attn',
        'no_xformers',
        'no_sdpa',
        'cfg_cache',
        'cpp_runner',
        'no_use_fast',
@@ -45,7 +45,7 @@ def create_ui():
            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. llama.cpp: 0 = auto if gpu-layers is also -1. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
            shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
            shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
            shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
            shared.gradio['tp_backend'] = gr.Dropdown(label="tp-backend", choices=['native', 'nccl'], value=shared.args.tp_backend, info='The backend for tensor parallelism.')

        with gr.Column():
@@ -55,7 +55,6 @@ def create_ui():
            shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
            shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
            shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
            shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
            shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
@@ -101,7 +100,6 @@ def create_ui():
            shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
            shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
            shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
            shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')

        with gr.Column():
            shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
@@ -112,9 +110,6 @@ def create_ui():
            shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
            shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
            shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
            shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
            shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
            shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
            shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
            shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
            if not shared.args.portable: