diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 7d79e516..952b73b8 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -3,6 +3,7 @@ import traceback
 from pathlib import Path
 
 import torch
+
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Cache,
@@ -15,7 +16,6 @@ from exllamav2 import (
     ExLlamaV2Tokenizer
 )
 from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
-
 from modules import shared
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
@@ -40,7 +40,7 @@ class Exllamav2Model:
 
         config.model_dir = str(path_to_model)
         config.prepare()
-        config.max_seq_len = shared.args.max_seq_len
+        config.max_seq_len = shared.args.ctx_size
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
         config.no_flash_attn = shared.args.no_flash_attn
diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index b159d9ce..d6c3bf6e 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -4,6 +4,15 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
 import torch
+from torch.nn import CrossEntropyLoss
+from transformers import (
+    GenerationConfig,
+    GenerationMixin,
+    PretrainedConfig,
+    PreTrainedModel
+)
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Cache,
@@ -14,15 +23,6 @@ from exllamav2 import (
     ExLlamaV2Cache_TP,
     ExLlamaV2Config
 )
-from torch.nn import CrossEntropyLoss
-from transformers import (
-    GenerationConfig,
-    GenerationMixin,
-    PretrainedConfig,
-    PreTrainedModel
-)
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
 from modules import shared
 from modules.logging_colors import logger
 
@@ -192,7 +192,7 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
 
         config.model_dir = str(pretrained_model_name_or_path)
         config.prepare()
-        config.max_seq_len = shared.args.max_seq_len
+        config.max_seq_len = shared.args.ctx_size
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
         config.no_flash_attn = shared.args.no_flash_attn
diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index 2d9c493a..24ba9e13 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -33,7 +33,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
         self.ex_model = Model.from_config(config)
 
         # Calculate the closest multiple of 256 at or above the chosen value
-        max_tokens = shared.args.max_seq_len
+        max_tokens = shared.args.ctx_size
         if max_tokens % 256 != 0:
             adjusted_tokens = ((max_tokens // 256) + 1) * 256
             logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}")
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 85743705..fb972a32 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -254,7 +254,7 @@ class LlamaServer:
         cmd = [
             self.server_path,
             "--model", self.model_path,
-            "--ctx-size", str(shared.args.n_ctx),
+            "--ctx-size", str(shared.args.ctx_size),
             "--n-gpu-layers", str(shared.args.n_gpu_layers),
             "--batch-size", str(shared.args.batch_size),
             "--port", str(self.port),
diff --git a/modules/loaders.py b/modules/loaders.py
index 9442a147..d8d62bf9 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -9,7 +9,7 @@ loaders_and_params = OrderedDict({
         'threads',
         'threads_batch',
         'batch_size',
-        'n_ctx',
+        'ctx_size',
         'cache_type',
         'tensor_split',
         'extra_flags',
@@ -48,14 +48,14 @@ loaders_and_params = OrderedDict({
         'no_use_fast',
     ],
     'ExLlamav3_HF': [
-        'max_seq_len',
+        'ctx_size',
        'gpu_split',
        'cfg_cache',
        'trust_remote_code',
        'no_use_fast',
    ],
    'ExLlamav2_HF': [
-        'max_seq_len',
+        'ctx_size',
         'cache_type',
         'gpu_split',
         'alpha_value',
@@ -71,7 +71,7 @@ loaders_and_params = OrderedDict({
         'no_use_fast',
     ],
     'ExLlamav2': [
-        'max_seq_len',
+        'ctx_size',
         'cache_type',
         'gpu_split',
         'alpha_value',
@@ -93,7 +93,7 @@ loaders_and_params = OrderedDict({
         'no_use_fast',
     ],
     'TensorRT-LLM': [
-        'max_seq_len',
+        'ctx_size',
         'cpp_runner',
         'tensorrt_llm_info',
     ]
diff --git a/modules/models.py b/modules/models.py
index 99b068aa..d0b0402a 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -52,10 +52,8 @@ def load_model(model_name, loader=None):
     tokenizer = load_tokenizer(model_name)
     shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
 
-    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'):
-        shared.settings['truncation_length'] = shared.args.max_seq_len
-    elif loader == 'llama.cpp':
-        shared.settings['truncation_length'] = shared.args.n_ctx
+    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
+        shared.settings['truncation_length'] = shared.args.ctx_size
 
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
diff --git a/modules/models_settings.py b/modules/models_settings.py
index ee2ed71b..d3ecd51f 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -11,8 +11,7 @@ def get_fallback_settings():
     return {
         'bf16': False,
         'use_eager_attention': False,
-        'max_seq_len': 2048,
-        'n_ctx': 2048,
+        'ctx_size': 2048,
         'rope_freq_base': 0,
         'compress_pos_emb': 1,
         'alpha_value': 1,
@@ -59,7 +58,7 @@ def get_model_metadata(model):
 
         for k in metadata:
             if k.endswith('context_length'):
-                model_settings['n_ctx'] = min(metadata[k], 8192)
+                model_settings['ctx_size'] = min(metadata[k], 8192)
                 model_settings['truncation_length_info'] = metadata[k]
             elif k.endswith('rope.freq_base'):
                 model_settings['rope_freq_base'] = metadata[k]
@@ -97,7 +96,7 @@ def get_model_metadata(model):
             if k in metadata:
                 model_settings['truncation_length'] = metadata[k]
                 model_settings['truncation_length_info'] = metadata[k]
-                model_settings['max_seq_len'] = min(metadata[k], 8192)
+                model_settings['ctx_size'] = min(metadata[k], 8192)
 
         if 'rope_theta' in metadata:
             model_settings['rope_freq_base'] = metadata['rope_theta']
diff --git a/modules/shared.py b/modules/shared.py
index c40f8e21..572bfc09 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -116,7 +116,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
 # llama.cpp
 group = parser.add_argument_group('llama.cpp')
 group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
-group.add_argument('--n_ctx', type=int, default=8192, help='Size of the prompt context.')
 group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
 group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
 group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
@@ -130,6 +129,11 @@ group.add_argument('--row-split', action='store_true', help='Split the model by
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"')
 group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 
+# Cache
+group = parser.add_argument_group('Context and cache management')
+group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, help='Context size in tokens.')
+group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
+
 # Speculative decoding
 group = parser.add_argument_group('Speculative decoding')
 group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.')
@@ -142,7 +146,6 @@ group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the pr
 group = parser.add_argument_group('ExLlamaV2')
 group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
 group.add_argument('--autosplit', action='store_true', help='Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.')
-group.add_argument('--max_seq_len', type=int, default=8192, help='Maximum sequence length.')
 group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
 group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
 group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py
index c2685b75..73178c39 100644
--- a/modules/tensorrt_llm.py
+++ b/modules/tensorrt_llm.py
@@ -1,15 +1,15 @@
 from pathlib import Path
 
-import tensorrt_llm
 import torch
-from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
+import tensorrt_llm
 
 from modules import shared
 from modules.logging_colors import logger
 from modules.text_generation import (
     get_max_prompt_length,
     get_reply_from_output_ids
 )
+from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
 
 
 class TensorRTLLMModel:
@@ -35,7 +35,7 @@ class TensorRTLLMModel:
             logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
             runner_kwargs.update(
                 max_batch_size=1,
-                max_input_len=shared.args.max_seq_len - 512,
+                max_input_len=shared.args.ctx_size - 512,
                 max_output_len=512,
                 max_beam_width=1,
                 max_attention_window_size=None,
diff --git a/modules/ui.py b/modules/ui.py
index 3e1bf6d8..68cb76a6 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -110,8 +110,7 @@ def list_model_elements():
         'threads_batch',
         'batch_size',
         'hqq_backend',
-        'n_ctx',
-        'max_seq_len',
+        'ctx_size',
         'cache_type',
         'tensor_split',
         'extra_flags',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 1460dfec..9aeb02d1 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -51,8 +51,7 @@ def create_ui():
                    shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
                    shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
                    shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
-                    shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.')
-                    shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.')
+                    shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
                    shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
                    shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
                    shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
@@ -92,7 +91,7 @@ def create_ui():
                    shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
                    shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
                    shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
-                    shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
+                    shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
 
                    # Speculative decoding
                    with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
@@ -247,10 +246,8 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
 
 
 def update_truncation_length(current_length, state):
     if 'loader' in state:
-        if state['loader'].lower().startswith('exllama'):
-            return state['max_seq_len']
-        elif state['loader'] == 'llama.cpp':
-            return state['n_ctx']
+        if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp':
+            return state['ctx_size']
 
     return current_length
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index b494a758..156e4128 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -121,10 +121,8 @@ def create_event_handlers():
 
 
 def get_truncation_length():
-    if 'max_seq_len' in shared.provided_arguments or shared.args.max_seq_len != shared.args_defaults.max_seq_len:
-        return shared.args.max_seq_len
-    elif 'n_ctx' in shared.provided_arguments or shared.args.n_ctx != shared.args_defaults.n_ctx:
-        return shared.args.n_ctx
+    if 'ctx_size' in shared.provided_arguments or shared.args.ctx_size != shared.args_defaults.ctx_size:
+        return shared.args.ctx_size
     else:
         return shared.settings['truncation_length']
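Reviewer note, not part of the patch: the three option strings on the new context-size argument are ordinary argparse aliases, so argparse derives the destination from the first long option (--ctx-size becomes args.ctx_size) and the old --n_ctx and --max_seq_len spellings keep working on the command line. Below is a minimal standalone sketch of that aliasing behavior (a hypothetical demo script, not the project's modules/shared.py); the default of 8192 matches the defaults of the two flags being removed.

# ctx_size_alias_demo.py - assumed demo name; only the add_argument call mirrors the patch.
import argparse

parser = argparse.ArgumentParser()
# All three option strings write to the same destination, args.ctx_size,
# because argparse derives the dest from the first long option string.
parser.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192,
                    help='Context size in tokens.')

for argv in (['--ctx-size', '4096'], ['--n_ctx', '4096'], ['--max_seq_len', '4096'], []):
    args = parser.parse_args(argv)
    print(argv or 'defaults', '->', args.ctx_size)  # 4096 for each alias, 8192 with no flags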