mirror of https://github.com/oobabooga/text-generation-webui.git
Update README
parent bd4881c4dc
commit e015355e4a
@@ -235,7 +235,7 @@ List of command-line flags
 ```txt
 usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
                  [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
-                 [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit]
+                 [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--torch-compile] [--load-in-4bit]
                  [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap]
                  [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
                  [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
@@ -278,8 +278,7 @@ Transformers/Accelerate:
 --trust-remote-code                   Set trust_remote_code=True while loading the model. Necessary for some models.
 --force-safetensors                   Set use_safetensors=True while loading the model. This prevents arbitrary code execution.
 --no_use_fast                         Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast.
---use_flash_attention_2               Set use_flash_attention_2=True while loading the model.
---use_eager_attention                 Set attn_implementation= eager while loading the model.
+--attn-implementation IMPLEMENTATION  Attention implementation. Valid options: eager, sdpa, flash_attention_2.
 --torch-compile                       Compile the model with torch.compile for improved performance.
 
 bitsandbytes 4-bit:
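For context on the README change above: the two removed switches each selected a single attention backend at load time, while the new `--attn-implementation` flag passes the chosen value through as one setting. Below is a minimal sketch of the underlying Transformers call, assuming the parsed value is forwarded to `from_pretrained`; the placeholder model name and the wiring are illustrative, not taken from this diff.

```python
# Illustrative only: shows what the consolidated flag ultimately selects in Transformers.
# "facebook/opt-125m" is a placeholder model; the webui's actual loading code may differ.
from transformers import AutoModelForCausalLM

def load_model(attn_implementation: str = "eager"):
    # Valid options mirror the new flag: eager, sdpa, flash_attention_2
    # (flash_attention_2 additionally requires the flash-attn package to be installed).
    return AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",
        attn_implementation=attn_implementation,
    )

# Old style: --use_flash_attention_2 / --use_eager_attention each implied one backend.
# New style: --attn-implementation flash_attention_2 selects it explicitly.
model = load_model("flash_attention_2")
```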
@@ -61,7 +61,7 @@ group.add_argument('--no-cache', action='store_true', help='Set use_cache to Fal
 group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
 group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
-group.add_argument('--attn-implementation', type=str, default='eager', help='Attention implementation. Valid options: eager, sdpa, flash_attention_2.')
+group.add_argument('--attn-implementation', type=str, default='eager', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: eager, sdpa, flash_attention_2.')
 group.add_argument('--torch-compile', action='store_true', help='Compile the model with torch.compile for improved performance.')
 
 # bitsandbytes 4-bit
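The only change in the argparse definition above is the added `metavar`. Without it, argparse derives the usage placeholder from the option's destination (`ATTN_IMPLEMENTATION`); with `metavar="IMPLEMENTATION"` the generated usage string matches the README text updated in this commit. A standalone sketch using a throwaway parser (not the webui's actual `group` object):

```python
# Standalone illustration of the metavar change; the parser here is hypothetical.
import argparse

parser = argparse.ArgumentParser(prog="server.py")
parser.add_argument('--attn-implementation', type=str, default='eager', metavar="IMPLEMENTATION",
                    help='Attention implementation. Valid options: eager, sdpa, flash_attention_2.')

args = parser.parse_args(['--attn-implementation', 'sdpa'])
print(args.attn_implementation)  # -> sdpa

# parser.format_usage() now contains "[--attn-implementation IMPLEMENTATION]";
# without metavar, argparse would print the uppercased dest instead: "ATTN_IMPLEMENTATION".
```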