llama.cpp: Remove the --flash-attn flag (it's always on now)

This commit is contained in:
oobabooga 2025-08-30 20:27:32 -07:00
parent 7b80e9a2ad
commit 13876a1ee8
5 changed files with 0 additions and 9 deletions

View file

@@ -318,8 +318,6 @@ class LlamaServer:
"--no-webui",
]
if shared.args.flash_attn:
cmd.append("--flash-attn")
if shared.args.threads > 0:
cmd += ["--threads", str(shared.args.threads)]
if shared.args.threads_batch > 0:

View file

@@ -16,7 +16,6 @@ loaders_and_params = OrderedDict({
'streaming_llm',
'rope_freq_base',
'compress_pos_emb',
'flash_attn',
'row_split',
'no_kv_offload',
'no_mmap',

View file

@@ -73,7 +73,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
# llama.cpp
group = parser.add_argument_group('llama.cpp')
group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
@@ -159,9 +158,6 @@ group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 f
group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API')
group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')
# Deprecated parameters
group = parser.add_argument_group('Deprecated')
# Handle CMD_FLAGS.txt
cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt"
if cmd_flags_path.exists():

View file

@@ -142,7 +142,6 @@ def list_model_elements():
'num_experts_per_token',
'load_in_8bit',
'load_in_4bit',
'flash_attn',
'attn_implementation',
'cpu',
'disk',

View file

@@ -50,7 +50,6 @@ def create_ui():
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)