diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 8579f843..6a094c9d 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -318,8 +318,6 @@ class LlamaServer:
                 "--no-webui",
             ]
 
-            if shared.args.flash_attn:
-                cmd.append("--flash-attn")
             if shared.args.threads > 0:
                 cmd += ["--threads", str(shared.args.threads)]
             if shared.args.threads_batch > 0:
diff --git a/modules/loaders.py b/modules/loaders.py
index f88e976d..fe982ab5 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -16,7 +16,6 @@ loaders_and_params = OrderedDict({
         'streaming_llm',
         'rope_freq_base',
         'compress_pos_emb',
-        'flash_attn',
         'row_split',
         'no_kv_offload',
         'no_mmap',
diff --git a/modules/shared.py b/modules/shared.py
index a3085239..4daf43c9 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -73,7 +73,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
 
 # llama.cpp
 group = parser.add_argument_group('llama.cpp')
-group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
 group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
 group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
 group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
@@ -159,9 +158,6 @@ group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 f
 group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API')
 group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')
 
-# Deprecated parameters
-group = parser.add_argument_group('Deprecated')
-
 # Handle CMD_FLAGS.txt
 cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt"
 if cmd_flags_path.exists():
diff --git a/modules/ui.py b/modules/ui.py
index 502005e7..12f43768 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -142,7 +142,6 @@ def list_model_elements():
         'num_experts_per_token',
         'load_in_8bit',
         'load_in_4bit',
-        'flash_attn',
         'attn_implementation',
         'cpu',
         'disk',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index dd240627..729700d4 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -50,7 +50,6 @@ def create_ui():
 
             with gr.Column():
                 shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
-                shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
                 shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                 shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                 shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)