Update README

2026-02-24 16:46:36 +01:00 · 2025-07-09 20:03:53 -07:00 · 2025-07-09 20:03:53 -07:00 · e015355e4a
parent bd4881c4dc
commit e015355e4a
2 changed files with 3 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -235,7 +235,7 @@ List of command-line flags
 ```txt
 usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
                 [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
-                 [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit]
+                 [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--torch-compile] [--load-in-4bit]
                 [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap]
                 [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
                 [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
@@ -278,8 +278,7 @@ Transformers/Accelerate:
  --trust-remote-code                       Set trust_remote_code=True while loading the model. Necessary for some models.
  --force-safetensors                       Set use_safetensors=True while loading the model. This prevents arbitrary code execution.
  --no_use_fast                             Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast.
-  --use_flash_attention_2                   Set use_flash_attention_2=True while loading the model.
-  --use_eager_attention                     Set attn_implementation= eager while loading the model.
+  --attn-implementation IMPLEMENTATION      Attention implementation. Valid options: eager, sdpa, flash_attention_2.
  --torch-compile                           Compile the model with torch.compile for improved performance.

 bitsandbytes 4-bit:
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -61,7 +61,7 @@ group.add_argument('--no-cache', action='store_true', help='Set use_cache to Fal
 group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
 group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
-group.add_argument('--attn-implementation', type=str, default='eager', help='Attention implementation. Valid options: eager, sdpa, flash_attention_2.')
+group.add_argument('--attn-implementation', type=str, default='eager', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: eager, sdpa, flash_attention_2.')
 group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.')

 # bitsandbytes 4-bit