mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-02-02 13:54:40 +01:00
Update README
This commit is contained in:
parent
6bf31479d9
commit
8cdb911a6e
29
README.md
29
README.md
|
|
@ -2,8 +2,6 @@
|
|||
|
||||
A Gradio web UI for Large Language Models.
|
||||
|
||||
Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation.
|
||||
|
||||
[Try the Deep Reason extension](https://oobabooga.gumroad.com/l/deep_reason)
|
||||
|
||||
| |  |
|
||||
|
|
@ -79,16 +77,11 @@ For users who need additional backends (ExLlamaV3, Transformers) or extensions (
|
|||
|
||||
To restart the web UI later, run the same `start_` script.
|
||||
|
||||
To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again.
|
||||
|
||||
You can pass command-line flags directly (e.g., `./start_linux.sh --help`), or add them to `user_data/CMD_FLAGS.txt` (e.g., `--api` to enable the API).
|
||||
|
||||
To update, run the update script for your OS: `update_wizard_windows.bat`, `update_wizard_linux.sh`, or `update_wizard_macos.sh`.
|
||||
|
||||
<details>
|
||||
<summary>
|
||||
Manual portable installation with venv
|
||||
</summary>
|
||||
To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again.
|
||||
|
||||
<summary>
|
||||
One-click installer details
|
||||
|
|
@ -236,13 +229,13 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [-
|
|||
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
|
||||
[--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant]
|
||||
[--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock]
|
||||
[--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
|
||||
[--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
|
||||
[--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR]
|
||||
[--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
|
||||
[--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
|
||||
[--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY]
|
||||
[--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
|
||||
[--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--mmproj MMPROJ] [--ctx-size N] [--cache-type N]
|
||||
[--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--enable-tp]
|
||||
[--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] [--deepspeed]
|
||||
[--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen]
|
||||
[--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE]
|
||||
[--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY]
|
||||
[--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
|
||||
|
||||
Text generation web UI
|
||||
|
||||
|
|
@ -299,6 +292,7 @@ llama.cpp:
|
|||
--row-split Split the model by rows across GPUs. This may improve multi-gpu performance.
|
||||
--extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
|
||||
--streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
|
||||
--mmproj MMPROJ Path to the mmproj file for vision models.
|
||||
|
||||
Context and cache:
|
||||
--ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens.
|
||||
|
|
@ -312,6 +306,10 @@ Speculative decoding:
|
|||
--device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1
|
||||
--ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model.
|
||||
|
||||
ExLlamaV3:
|
||||
--enable-tp, --enable_tp Enable Tensor Parallelism (TP) to split the model across GPUs.
|
||||
--tp-backend TP_BACKEND The backend for tensor parallelism. Valid options: native, nccl. Default: native.
|
||||
|
||||
ExLlamaV2:
|
||||
--gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.
|
||||
--autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.
|
||||
|
|
@ -320,7 +318,6 @@ ExLlamaV2:
|
|||
--no_xformers Force xformers to not be used.
|
||||
--no_sdpa Force Torch SDPA to not be used.
|
||||
--num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral.
|
||||
--enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2.
|
||||
|
||||
TensorRT-LLM:
|
||||
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
|
||||
|
|
|
|||
Loading…
Reference in a new issue