From 8144e1031e0f311a69092e75f6517654c7965617 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 18 Apr 2025 06:02:28 -0700 Subject: [PATCH] Remove deprecated command-line flags --- modules/loaders.py | 1 - modules/shared.py | 76 ---------------------------------------- modules/ui.py | 1 - modules/ui_model_menu.py | 1 - 4 files changed, 79 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index 3060406d..aaf4c8ed 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -39,7 +39,6 @@ loaders_and_params = OrderedDict({ 'flash_attn', 'row_split', 'no_kv_offload', - 'no_mul_mat_q', 'no_mmap', 'mlock', 'numa', diff --git a/modules/shared.py b/modules/shared.py index 83761b75..9e525bd9 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -192,26 +192,6 @@ group.add_argument('--nowebui', action='store_true', help='Do not launch the Gra # Deprecated parameters group = parser.add_argument_group('Deprecated') -group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED') -group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED') -group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED') -group.add_argument('--triton', action='store_true', help='DEPRECATED') -group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED') -group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED') -group.add_argument('--desc_act', action='store_true', help='DEPRECATED') -group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED') -group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED') -group.add_argument('--wbits', type=int, default=0, help='DEPRECATED') -group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED') -group.add_argument('--model-menu', action='store_true', help='DEPRECATED') -group.add_argument('--multimodal-pipeline', type=str, default=None, help='DEPRECATED') -group.add_argument('--streaming-llm', action='store_true', help='DEPRECATED') -group.add_argument('--attention-sink-size', type=int, default=5, help='DEPRECATED') -group.add_argument('--tokenizer-dir', type=str, help='DEPRECATED') -group.add_argument('--logits_all', action='store_true', help='DEPRECATED') -group.add_argument('--no_mul_mat_q', action='store_true', help='DEPRECATED') -group.add_argument('--cache-capacity', type=str, help='DEPRECATED') -group.add_argument('--tensorcores', action='store_true', help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -276,58 +256,6 @@ def fix_loader_name(name): return 'TensorRT-LLM' -def transform_legacy_kv_cache_options(opts): - # Handle both argparse.Namespace and dict here - def get(key): - return opts.get(key) if isinstance(opts, dict) else getattr(opts, key, None) - - def set(key, value): - if isinstance(opts, dict): - opts[key] = value - else: - setattr(opts, key, value) - - def del_key(key, fallback_set): - # only remove from user dict, can't delete from argparse.Namespace - if type(opts) is dict: - if key in opts: - del opts[key] - else: - setattr(opts, key, fallback_set) - - # Retrieve values - loader = get('loader') - cache_8bit = get('cache_8bit') - cache_4bit = get('cache_4bit') - - # Determine cache type based on loader or legacy flags - if cache_8bit or cache_4bit: - if not loader: - # Legacy behavior: prefer 8-bit over 4-bit to minimize breakage - if cache_8bit: - set('cache_type', 'fp8') - elif cache_4bit: - set('cache_type', 'q4') - elif loader.lower() in ['exllamav2', 'exllamav2_hf']: - # ExLlamaV2 loader-specific cache type - if cache_8bit: - set('cache_type', 'fp8') - elif cache_4bit: - set('cache_type', 'q4') - elif loader.lower() == 'llama.cpp': - # Llama.cpp loader-specific cache type - if cache_4bit: - set('cache_type', 'q4_0') - elif cache_8bit: - set('cache_type', 'q8_0') - - # Clean up legacy keys - del_key('cache_4bit', False) - del_key('cache_8bit', False) - - return opts - - def add_extension(name, last=False): if args.extensions is None: args.extensions = [name] @@ -356,14 +284,10 @@ def load_user_config(): else: user_config = {} - for model_name in user_config: - user_config[model_name] = transform_legacy_kv_cache_options(user_config[model_name]) - return user_config args.loader = fix_loader_name(args.loader) -args = transform_legacy_kv_cache_options(args) # Activate the API extension if args.api or args.public_api: diff --git a/modules/ui.py b/modules/ui.py index 919ab8da..c36fe553 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -133,7 +133,6 @@ def list_model_elements(): 'disk', 'row_split', 'no_kv_offload', - 'no_mul_mat_q', 'no_mmap', 'mlock', 'numa', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index fca883eb..cdc0e77c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -112,7 +112,6 @@ def create_ui(): shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') - shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.') shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')