mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-10 23:53:49 +01:00
llama.cpp: Remove the --flash-attn flag (it's always on now)
This commit is contained in:
parent
7b80e9a2ad
commit
13876a1ee8
|
|
@ -318,8 +318,6 @@ class LlamaServer:
|
|||
"--no-webui",
|
||||
]
|
||||
|
||||
if shared.args.flash_attn:
|
||||
cmd.append("--flash-attn")
|
||||
if shared.args.threads > 0:
|
||||
cmd += ["--threads", str(shared.args.threads)]
|
||||
if shared.args.threads_batch > 0:
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@ loaders_and_params = OrderedDict({
|
|||
'streaming_llm',
|
||||
'rope_freq_base',
|
||||
'compress_pos_emb',
|
||||
'flash_attn',
|
||||
'row_split',
|
||||
'no_kv_offload',
|
||||
'no_mmap',
|
||||
|
|
|
|||
|
|
@ -73,7 +73,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
|
|||
|
||||
# llama.cpp
|
||||
group = parser.add_argument_group('llama.cpp')
|
||||
group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
|
||||
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
|
||||
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
|
||||
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
|
||||
|
|
@ -159,9 +158,6 @@ group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 f
|
|||
group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API')
|
||||
group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')
|
||||
|
||||
# Deprecated parameters
|
||||
group = parser.add_argument_group('Deprecated')
|
||||
|
||||
# Handle CMD_FLAGS.txt
|
||||
cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt"
|
||||
if cmd_flags_path.exists():
|
||||
|
|
|
|||
|
|
@ -142,7 +142,6 @@ def list_model_elements():
|
|||
'num_experts_per_token',
|
||||
'load_in_8bit',
|
||||
'load_in_4bit',
|
||||
'flash_attn',
|
||||
'attn_implementation',
|
||||
'cpu',
|
||||
'disk',
|
||||
|
|
|
|||
|
|
@ -50,7 +50,6 @@ def create_ui():
|
|||
|
||||
with gr.Column():
|
||||
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
|
||||
shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
|
||||
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
|
||||
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
|
||||
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
|
||||
|
|
|
|||
Loading…
Reference in a new issue