From 8cdb911a6e637c355dc9eac2ab43f94eab7b3281 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 17 Aug 2025 22:06:12 -0700
Subject: [PATCH] Update README

---
 README.md | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index ccde678a..f213f7a9 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,6 @@
 
 A Gradio web UI for Large Language Models.
 
-Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation.
-
 [Try the Deep Reason extension](https://oobabooga.gumroad.com/l/deep_reason)
 
 |![Image1](https://github.com/oobabooga/screenshots/raw/main/INSTRUCT-3.5.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/CHAT-3.5.png) |
@@ -79,16 +77,11 @@ For users who need additional backends (ExLlamaV3, Transformers) or extensions (
 To restart the web UI later, run the same `start_` script.
 
-To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again.
-
 You can pass command-line flags directly (e.g., `./start_linux.sh --help`), or add them to `user_data/CMD_FLAGS.txt` (e.g., `--api` to enable the API).
 
 To update, run the update script for your OS: `update_wizard_windows.bat`, `update_wizard_linux.sh`, or `update_wizard_macos.sh`.
 
-<details>
-<summary>
-Manual portable installation with venv
-</summary>
+To reinstall with a fresh Python environment, delete the `installer_files` folder and run the `start_` script again.
 
 <details>
 <summary>
 One-click installer details
@@ -236,13 +229,13 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [-
                  [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
                  [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant]
                  [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock]
-                 [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
-                 [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
-                 [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR]
-                 [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
-                 [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
-                 [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY]
-                 [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
+                 [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--mmproj MMPROJ] [--ctx-size N] [--cache-type N]
+                 [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--enable-tp]
+                 [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] [--deepspeed]
+                 [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen]
+                 [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE]
+                 [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY]
+                 [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
 
 Text generation web UI
 
@@ -299,6 +292,7 @@ llama.cpp:
   --row-split                                Split the model by rows across GPUs. This may improve multi-gpu performance.
   --extra-flags EXTRA_FLAGS                  Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
   --streaming-llm                            Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
+  --mmproj MMPROJ                            Path to the mmproj file for vision models.
 
 Context and cache:
   --ctx-size N, --n_ctx N, --max_seq_len N   Context size in tokens.
@@ -312,6 +306,10 @@ Speculative decoding:
   --device-draft DEVICE_DRAFT                Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1
   --ctx-size-draft CTX_SIZE_DRAFT            Size of the prompt context for the draft model. If 0, uses the same as the main model.
 
+ExLlamaV3:
+  --enable-tp, --enable_tp                   Enable Tensor Parallelism (TP) to split the model across GPUs.
+  --tp-backend TP_BACKEND                    The backend for tensor parallelism. Valid options: native, nccl. Default: native.
+
 ExLlamaV2:
   --gpu-split GPU_SPLIT                      Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.
   --autosplit                                Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.
@@ -320,7 +318,6 @@ ExLlamaV2:
   --no_xformers                              Force xformers to not be used.
   --no_sdpa                                  Force Torch SDPA to not be used.
   --num_experts_per_token N                  Number of experts to use for generation. Applies to MoE models like Mixtral.
-  --enable_tp                                Enable Tensor Parallelism (TP) in ExLlamaV2.
 
 TensorRT-LLM:
   --cpp-runner                               Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
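
For reference, the flag-passing behavior kept in the README hunk above, as a minimal shell sketch; the flag choices here are illustrative, not recommendations:

```sh
# One-off: pass flags directly to the start script.
./start_linux.sh --api --listen

# Persistent: flags written to user_data/CMD_FLAGS.txt are applied on
# every subsequent launch.
echo "--api --listen" > user_data/CMD_FLAGS.txt
./start_linux.sh
```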
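
The new llama.cpp flags in context: a sketch of loading a vision model with the added `--mmproj` option, and of speculative decoding with a draft model. The model filenames are placeholders, not files shipped with the project:

```sh
# Vision model: pair a GGUF checkpoint with its multimodal projector
# via the new --mmproj flag (both paths are placeholders).
./start_linux.sh --model my-vision-model.gguf --mmproj my-mmproj.gguf

# Speculative decoding: a small draft model proposes tokens that the
# main model verifies; --draft-max caps the tokens proposed per step.
./start_linux.sh --model my-large-model.gguf \
    --model-draft my-draft-model.gguf --draft-max 8
```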
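
Likewise for the new ExLlamaV3 tensor-parallelism options documented above; the model name is a placeholder and the exact `--loader` spelling is an assumption, so check `--help` for the accepted values:

```sh
# Split one model across all visible GPUs with tensor parallelism.
# --tp-backend defaults to "native"; "nccl" is the documented alternative.
./start_linux.sh --loader exllamav3 --model my-exl3-model \
    --enable-tp --tp-backend nccl
```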