From 1ffe540c9710f184de421fe83ad53bb1f784841d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:46:21 -0300 Subject: [PATCH] Full documentation update to match current codebase --- README.md | 219 +++++++++++++------------ docs/01 - Chat Tab.md | 49 +++--- docs/02 - Default and Notebook Tabs.md | 6 +- docs/03 - Parameters Tab.md | 71 +++++--- docs/04 - Model Tab.md | 157 +++++++++--------- docs/06 - Session Tab.md | 13 +- docs/07 - Extensions.md | 25 +-- docs/11 - AMD Setup.md | 26 ++- docs/12 - OpenAI API.md | 121 ++++++-------- docs/What Works.md | 27 ++- 10 files changed, 388 insertions(+), 326 deletions(-) diff --git a/README.md b/README.md index a72e8060..a2ba87f8 100644 --- a/README.md +++ b/README.md @@ -239,130 +239,150 @@ List of command-line flags ```txt usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS] - [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] - [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-layers N] [--mmproj MMPROJ] [--streaming-llm] - [--tensor-split TENSOR_SPLIT] [--row-split] [--no-mmap] [--mlock] [--no-kv-offload] [--batch-size BATCH_SIZE] [--threads THREADS] [--threads-batch THREADS_BATCH] [--numa] + [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--image-model IMAGE_MODEL] [--image-model-dir IMAGE_MODEL_DIR] [--image-dtype {bfloat16,float16}] + [--image-attn-backend {flash_attention_2,sdpa}] [--image-cpu-offload] [--image-compile] [--image-quant {none,bnb-8bit,bnb-4bit,torchao-int8wo,torchao-fp4,torchao-float8wo}] + [--loader LOADER] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] 
[--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] + [--ctx-size-draft CTX_SIZE_DRAFT] [--spec-type {none,ngram-mod,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-cache}] [--spec-ngram-size-n SPEC_NGRAM_SIZE_N] + [--spec-ngram-size-m SPEC_NGRAM_SIZE_M] [--spec-ngram-min-hits SPEC_NGRAM_MIN_HITS] [--gpu-layers N] [--cpu-moe] [--mmproj MMPROJ] [--streaming-llm] [--tensor-split TENSOR_SPLIT] + [--row-split] [--no-mmap] [--mlock] [--no-kv-offload] [--batch-size BATCH_SIZE] [--ubatch-size UBATCH_SIZE] [--threads THREADS] [--threads-batch THREADS_BATCH] [--numa] [--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--enable-tp] [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] - [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] - [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] - [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] - [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] + [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] + [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] 
[--old-colors] + [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] + [--nowebui] Text Generation Web UI options: - -h, --help show this help message and exit + -h, --help show this help message and exit Basic settings: - --multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly. - --model MODEL Name of the model to load by default. - --lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. - --model-dir MODEL_DIR Path to directory with all the models. - --lora-dir LORA_DIR Path to directory with all the loras. - --model-menu Show a model menu in the terminal when the web UI is first launched. - --settings SETTINGS Load the default interface settings from this yaml file. See user_data/settings-template.yaml for an example. If you create a file called - user_data/settings.yaml, this file will be loaded by default without the need to use the --settings flag. - --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. - --verbose Print the prompts to the terminal. - --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. + --multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly. + --model MODEL Name of the model to load by default. + --lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. + --model-dir MODEL_DIR Path to directory with all the models. + --lora-dir LORA_DIR Path to directory with all the loras. 
+ --model-menu Show a model menu in the terminal when the web UI is first launched. + --settings SETTINGS Load the default interface settings from this yaml file. See user_data/settings-template.yaml for an example. If you create a file called + user_data/settings.yaml, this file will be loaded by default without the need to use the --settings flag. + --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. + --verbose Print the prompts to the terminal. + --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. + +Image model: + --image-model IMAGE_MODEL Name of the image model to select on startup (overrides saved setting). + --image-model-dir IMAGE_MODEL_DIR Path to directory with all the image models. + --image-dtype {bfloat16,float16} Data type for image model. + --image-attn-backend {flash_attention_2,sdpa} Attention backend for image model. + --image-cpu-offload Enable CPU offloading for image model. + --image-compile Compile the image model for faster inference. + --image-quant {none,bnb-8bit,bnb-4bit,torchao-int8wo,torchao-fp4,torchao-float8wo} + Quantization method for image model. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, - TensorRT-LLM. + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, + ExLlamav2, TensorRT-LLM. Context and cache: - --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. - --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits - separately, e.g. q4_q8). 
+ --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. llama.cpp: 0 = auto if gpu-layers is also -1. + --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and + v_bits separately, e.g. q4_q8). Speculative decoding: - --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. - --draft-max DRAFT_MAX Number of tokens to draft for speculative decoding. - --gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model. - --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 - --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. + --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. + --draft-max DRAFT_MAX Number of tokens to draft for speculative decoding. + --gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model. + --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 + --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. + --spec-type {none,ngram-mod,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-cache} + Draftless speculative decoding type. Recommended: ngram-mod. + --spec-ngram-size-n SPEC_NGRAM_SIZE_N N-gram lookup size for ngram speculative decoding. + --spec-ngram-size-m SPEC_NGRAM_SIZE_M Draft n-gram size for ngram speculative decoding. + --spec-ngram-min-hits SPEC_NGRAM_MIN_HITS Minimum n-gram hits for ngram-map speculative decoding. llama.cpp: - --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. - --mmproj MMPROJ Path to the mmproj file for vision models. 
- --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. - --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. - --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. - --no-mmap Prevent mmap from being used. - --mlock Force the system to keep the model in RAM. - --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. - --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. - --threads THREADS Number of threads to use. - --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. - --numa Activate NUMA task allocation for llama.cpp. - --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" + --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. -1 = auto. + --cpu-moe Move the experts to the CPU (for MoE models). + --mmproj MMPROJ Path to the mmproj file for vision models. + --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. + --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. + --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. + --no-mmap Prevent mmap from being used. + --mlock Force the system to keep the model in RAM. + --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. + --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size. + --ubatch-size UBATCH_SIZE Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level). 
+ --threads THREADS Number of threads to use. + --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. + --numa Activate NUMA task allocation for llama.cpp. + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" Transformers/Accelerate: - --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. - --cpu-memory CPU_MEMORY Maximum CPU memory in GiB. Use this for CPU offloading. - --disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. - --disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "user_data/cache". - --load-in-8bit Load the model with 8-bit precision (using bitsandbytes). - --bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. - --no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. - --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. - --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. - --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. - --attn-implementation IMPLEMENTATION Attention implementation. Valid options: sdpa, eager, flash_attention_2. + --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. + --cpu-memory CPU_MEMORY Maximum CPU memory in GiB. Use this for CPU offloading. + --disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. + --disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "user_data/cache". + --load-in-8bit Load the model with 8-bit precision (using bitsandbytes). + --bf16 Load the model with bfloat16 precision. 
Requires NVIDIA Ampere GPU. + --no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. + --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. + --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. + --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. + --attn-implementation IMPLEMENTATION Attention implementation. Valid options: sdpa, eager, flash_attention_2. bitsandbytes 4-bit: - --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). - --use_double_quant use_double_quant for 4-bit. - --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. - --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. + --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). + --use_double_quant use_double_quant for 4-bit. + --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. + --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. ExLlamaV3: - --enable-tp, --enable_tp Enable Tensor Parallelism (TP) to split the model across GPUs. - --tp-backend TP_BACKEND The backend for tensor parallelism. Valid options: native, nccl. Default: native. + --enable-tp, --enable_tp Enable Tensor Parallelism (TP) to split the model across GPUs. + --tp-backend TP_BACKEND The backend for tensor parallelism. Valid options: native, nccl. Default: native. ExLlamaV2: - --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. - --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. - --cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. 
Necessary to use CFG with that loader. - --no_flash_attn Force flash-attention to not be used. - --no_xformers Force xformers to not be used. - --no_sdpa Force Torch SDPA to not be used. - --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. + --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. + --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. + --cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. + --no_flash_attn Force flash-attention to not be used. + --no_xformers Force xformers to not be used. + --no_sdpa Force Torch SDPA to not be used. + --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. TensorRT-LLM: - --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. + --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner. RoPE: - --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. - --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). - --compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. + --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. + --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). + --compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. 
Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. Gradio: - --listen Make the web UI reachable from your local network. - --listen-port LISTEN_PORT The listening port that the server will use. - --listen-host LISTEN_HOST The hostname that the server will use. - --share Create a public URL. This is useful for running the web UI on Google Colab or similar. - --auto-launch Open the web UI in the default browser upon launch. - --gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". - --gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. - --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. - --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. - --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy - --old-colors Use the legacy Gradio colors, before the December/2024 update. - --portable Hide features not available in portable mode like training. + --listen Make the web UI reachable from your local network. + --listen-port LISTEN_PORT The listening port that the server will use. + --listen-host LISTEN_HOST The hostname that the server will use. + --share Create a public URL. This is useful for running the web UI on Google Colab or similar. + --auto-launch Open the web UI in the default browser upon launch. + --gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". + --gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. + --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. 
+ --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. + --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy + --old-colors Use the legacy Gradio colors, before the December/2024 update. + --portable Hide features not available in portable mode like training. API: - --api Enable the API extension. - --public-api Create a public URL for the API using Cloudfare. - --public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. - --api-port API_PORT The listening port for the API. - --api-key API_KEY API authentication key. - --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. - --api-enable-ipv6 Enable IPv6 for the API - --api-disable-ipv4 Disable IPv4 for the API - --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. + --api Enable the API extension. + --public-api Create a public URL for the API using Cloudfare. + --public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. + --api-port API_PORT The listening port for the API. + --api-key API_KEY API authentication key. + --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. + --api-enable-ipv6 Enable IPv6 for the API + --api-disable-ipv4 Disable IPv4 for the API + --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. 
``` @@ -390,20 +410,17 @@ text-generation-webui text-generation-webui └── user_data └── models - └── lmsys_vicuna-33b-v1.3 + └── Qwen_Qwen3-8B ├── config.json ├── generation_config.json - ├── pytorch_model-00001-of-00007.bin - ├── pytorch_model-00002-of-00007.bin - ├── pytorch_model-00003-of-00007.bin - ├── pytorch_model-00004-of-00007.bin - ├── pytorch_model-00005-of-00007.bin - ├── pytorch_model-00006-of-00007.bin - ├── pytorch_model-00007-of-00007.bin - ├── pytorch_model.bin.index.json + ├── model-00001-of-00004.safetensors + ├── model-00002-of-00004.safetensors + ├── model-00003-of-00004.safetensors + ├── model-00004-of-00004.safetensors + ├── model.safetensors.index.json ├── special_tokens_map.json ├── tokenizer_config.json - └── tokenizer.model + └── tokenizer.json ``` In both cases, you can use the "Model" tab of the UI to download the model from Hugging Face automatically. It is also possible to download it via the command-line with: diff --git a/docs/01 - Chat Tab.md b/docs/01 - Chat Tab.md index 4b177b80..5104895f 100644 --- a/docs/01 - Chat Tab.md +++ b/docs/01 - Chat Tab.md @@ -2,31 +2,44 @@ Used to have multi-turn conversations with the model. ## Input area -The following buttons can be found. Note that the hover menu can be replaced with always-visible buttons with the `--chat-buttons` flag. +The main action buttons are: -* **Generate**: sends your message and makes the model start a reply. +* **Send**: sends your message and makes the model start a reply. * **Stop**: stops an ongoing generation as soon as the next token is generated (which can take a while for a slow model). + +The hover menu (☰) that appears over the chat area contains: + +* **Regenerate**: similar to Send, but your last message is used as input instead of the text in the input field. Note that if the temperature/top_p/top_k parameters are low in the "Parameters" tab of the UI, the new reply may end up identical to the previous one. 
* **Continue**: makes the model attempt to continue the existing reply. In some cases, the model may simply end the existing turn immediately without generating anything new, but in other cases, it may generate a longer reply. -* **Regenerate**: similar to Generate, but your last message is used as input instead of the text in the input field. Note that if the temperature/top_p/top_k parameters are low in the "Parameters" tab of the UI, the new reply may end up identical to the previous one. * **Remove last reply**: removes the last input/output pair from the history and sends your last message back into the input field. -* **Replace last reply**: replaces the last reply with whatever you typed into the input field. Useful in conjunction with "Copy last reply" if you want to edit the bot response. -* **Copy last reply**: sends the contents of the bot's last reply to the input field. * **Impersonate**: makes the model generate a new message on your behalf in the input field, taking into consideration the existing chat history. * **Send dummy message**: adds a new message to the chat history without causing the model to generate a reply. * **Send dummy reply**: adds a new reply to the chat history as if the model had generated this reply. Useful in conjunction with "Send dummy message". -* **Start new chat**: starts a new conversation while keeping the old one saved. If you are talking to a character that has a "Greeting" message defined, this message will be automatically added to the new history. -* **Send to default**: sends the entire chat prompt up to now to the "Default" tab. -* **Send to notebook**: sends the entire chat prompt up to now to the "Notebook" tab. - -The **Show controls** checkbox causes the input fields below the input textbox to disappear. It is useful for making the page fit entirely into view and not scroll. +* **Send to Notebook**: sends the entire chat prompt up to now to the Notebook tab. 
+* **Show controls**: checkbox that toggles the visibility of the sidebar controls (Start reply with, Mode, Chat style, etc.). Shortcut: Ctrl+S. ## Past chats -Allows you to switch between the current and previous conversations with the current character, or between the current and previous instruct conversations (if in "instruct" mode). The **Rename** menu can be used to give a unique name to the selected conversation, and the 🗑️ button allows you to delete it. +Allows you to switch between the current and previous conversations with the current character, or between the current and previous instruct conversations (if in "instruct" mode). The available buttons are: -## Start reply with +* **Branch**: creates a branch of the current conversation at a specific message. +* **Rename**: allows you to give a unique name to the selected conversation. +* **🗑️**: deletes the selected conversation. +* **New chat**: starts a new conversation. If you are talking to a character that has a "Greeting" message defined, this message will be automatically added to the new history. -Whatever you type there will appear at the start of every reply by the bot. This is useful to guide the response in the desired direction. +A search field is also available to filter conversations by name. + +## Sidebar controls + +The sidebar (toggled via "Show controls") contains: + +* **Start reply with**: whatever you type there will appear at the start of every reply by the bot. This is useful to guide the response in the desired direction. +* **Reasoning effort**: controls the thinking depth for models that support it. Options: low, medium, high. +* **Enable thinking**: enables extended thinking mode for models that support it. +* **Activate web search**: when enabled, the model can search the web for information before replying. You can also set the number of pages to download. +* **Mode**: see below. +* **Chat style**: see below. 
+* **Command for chat-instruct mode**: the command that is used in chat-instruct mode to query the model to generate a reply on behalf of the character. Can be used creatively to generate specific kinds of responses. Inside this string, `<|character|>` is a placeholder that gets replaced with the bot name, and `<|prompt|>` is a placeholder that gets replaced with the full chat prompt. ## Mode @@ -73,7 +86,7 @@ Now that an instruction-following model is defined, we can move on to describing ### Chat -Used for talking to the character defined under "Parameters" > "Character" using a simple chat prompt in this format: +Used for talking to the character defined under the "Character" tab using a simple chat prompt in this format: ``` Chiharu Yamada's Persona: Chiharu Yamada is a young, computer engineer-nerd with a knack for problem solving and a passion for technology. @@ -83,7 +96,7 @@ You: How are you? Chiharu Yamada: I'm doing well, thank you for asking! Is there something specific you would like to talk about or ask me? I'm here to help answer any questions you may have. ``` -There are 3 adjustable parameters in "Parameters" > "Character" being used in this prompt: +There are 3 adjustable parameters in the "Character" tab being used in this prompt: * The **Context** string appears at the top of the prompt. Most often it describes the bot's personality and adds a few example messages to guide the model towards the desired reply length and format. This string never gets truncated: as the prompt size increases, old messages get removed one at a time until the prompt becomes smaller than the truncation length set under "Parameters" > "Generation" > "Truncate the prompt up to this length". * The **Your name** string appears at the beginning of each user reply. By default, this string is "You".
@@ -99,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template. -Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format. +Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format. ### Chat-instruct @@ -127,8 +140,6 @@ Here, the command is Below this command, the regular chat prompt is added, including its Context string and the chat history, and then the user turn ends. The bot turn starts with the "Character's name" string followed by `:`, thus prompting the instruction-following model to write a single reply for the character. -The chat-instruct command can be customized under "Parameters" > "Instruction template" > "Command for chat-instruct mode". Inside that command string, `<|character|>` is a placeholder that gets replaced with the bot name, and `<|prompt|>` is a placeholder that gets replaced with the full chat prompt. 
- Note that you can get creative: instead of writing something trivial like "Write a single reply for the character", you could add more complex instructions like > This is an adventure game, and your task is to write a reply in name of "<|character|>" where 3 options are given for the user to then choose from. @@ -145,4 +156,4 @@ The styles are only applied to chat and chat-instruct modes. Instruct mode has i ## Character gallery -This menu is a built-in extension defined under `text-generation-webui/extensions/gallery`. It displays a gallery with your characters, and if you click on a character, it will be automatically selected in the menu under "Parameters" > "Character". +This menu is a built-in extension defined under `text-generation-webui/extensions/gallery`. It displays a gallery with your characters, and if you click on a character, it will be automatically selected in the Character tab. diff --git a/docs/02 - Default and Notebook Tabs.md b/docs/02 - Default and Notebook Tabs.md index 4bb78448..ba1028ab 100644 --- a/docs/02 - Default and Notebook Tabs.md +++ b/docs/02 - Default and Notebook Tabs.md @@ -10,11 +10,11 @@ The number on the lower right of the Input box counts the number of tokens in th Below the Input box, the following buttons can be found: +* **Continue**: starts a new generation taking as input the text in the "Output" box. * **Generate**: starts a new generation. * **Stop**: stops an ongoing generation as soon as the next token is generated (which can take a while for a slow model). -* **Continue**: starts a new generation taking as input the text in the "Output" box. -In the **Prompt** menu, you can select from some predefined prompts defined under `text-generation-webui/prompts`. The 💾 button saves your current input as a new prompt, the 🗑️ button deletes the selected prompt, and the 🔄 button refreshes the list. If you come up with an interesting prompt for a certain task, you are welcome to submit it to the repository. 
+In the **Prompt** menu, you can select from saved prompts stored in `user_data/logs/notebook`. The **New** button creates a new prompt, the **Rename** button renames the selected prompt, and the 🗑️ button deletes it. The 🔄 button refreshes the list. ### Output @@ -28,7 +28,7 @@ Five tabs can be found: ## Notebook tab -Precisely the same thing as the Default tab, with the difference that the output appears in the same text box as the input. +Precisely the same thing as the Default tab, with the difference that the output appears in the same text box as the input. It contains the following additional button: diff --git a/docs/03 - Parameters Tab.md b/docs/03 - Parameters Tab.md index 8c758ce6..51795054 100644 --- a/docs/03 - Parameters Tab.md +++ b/docs/03 - Parameters Tab.md @@ -43,9 +43,15 @@ For more information about the parameters, the [transformers documentation](http * **presence_penalty**: Similar to repetition_penalty, but with an additive offset on the raw token scores instead of a multiplicative factor. It may generate better results. 0 means no penalty, higher value = less repetition, lower value = more repetition. Previously called "additive_repetition_penalty". * **frequency_penalty**: Repetition penalty that scales based on how many times the token has appeared in the context. Be careful with this; there's no limit to how much a token can be penalized. * **repetition_penalty_range**: The number of most recent tokens to consider for repetition penalty. 0 makes all tokens be used. +* **dry_multiplier**: Set to greater than 0 to enable DRY (Don't Repeat Yourself) sampling. It penalizes tokens that would extend a sequence that already appeared in the context. Recommended value: 0.8. +* **dry_allowed_length**: The longest sequence that can be repeated without being penalized by DRY. Shorter values make DRY more aggressive. +* **dry_base**: Controls how fast the DRY penalty grows with increasing sequence length. 
* **typical_p**: If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text. * **tfs**: Tries to detect a tail of low-probability tokens in the distribution and removes those tokens. See [this blog post](https://www.trentonbricken.com/Tail-Free-Sampling/) for details. The closer to 0, the more discarded tokens. * **top_a**: Tokens with probability smaller than `(top_a) * (probability of the most likely token)^2` are discarded. +* **top_n_sigma**: Keeps only tokens within N standard deviations of the mean log-probability. Acts as an adaptive cutoff that adjusts to the shape of the distribution. 0 disables it. +* **xtc_threshold**: eXclusion from Top Choices (XTC) sampling. If 2 or more tokens have probability above this threshold, the top token may be removed. This encourages the model to use less common word choices and can increase creativity. +* **xtc_probability**: The probability that XTC removal will actually happen when the threshold condition is met. Set to 1 for it to always apply, or lower for occasional application. * **epsilon_cutoff**: In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. * **eta_cutoff**: In units of 1e-4; a reasonable value is 3. The main parameter of the special Eta Sampling technique. See [this paper](https://arxiv.org/pdf/2210.15191.pdf) for a description. * **guidance_scale**: The main parameter for Classifier-Free Guidance (CFG). [The paper](https://arxiv.org/pdf/2306.17806.pdf) suggests that 1.5 is a good value. It can be used in conjunction with a negative prompt or not. @@ -55,9 +61,12 @@ For more information about the parameters, the [transformers documentation](http *Note: Use either mirostat or dynamic_temperature, not both at the same time.* * **mirostat_tau**: Target perplexity for Mirostat sampling. Controls how “surprising” the text is. 
Higher values = more diverse, lower = more predictable. Preset Arena suggests 8 as a good value. * **mirostat_eta**: Learning rate for Mirostat’s perplexity adjustment. Higher values = adapts faster but less stable, lower values = slower but more stable. Preset Arena suggests 0.1 as a good value. +* **adaptive_target**: Target probability for adaptive-p sampling. This method adjusts the sampling threshold dynamically based on an exponential moving average of recent token probabilities. 0 disables it. +* **adaptive_decay**: EMA decay rate for adaptive-p sampling. Controls how quickly the running average adjusts. Default: 0.9. * **dynamic_temperature**: Activates Dynamic Temperature. This modifies temperature to range between "dynatemp_low" (minimum) and "dynatemp_high" (maximum), with an entropy-based scaling. The steepness of the curve is controlled by "dynatemp_exponent". *Note: Use either dynamic_temperature or mirostat, not both at the same time.* * **smoothing_factor**: Activates Quadratic Sampling. When `0 < smoothing_factor < 1`, the logits distribution becomes flatter. When `smoothing_factor > 1`, it becomes more peaked. +* **smoothing_curve**: Adjusts the dropoff curve of Quadratic Sampling. Higher values make the curve steeper. Only takes effect when smoothing_factor is set. * **temperature_last**: Makes temperature the last sampler instead of the first. With this, you can remove low probability tokens with a sampler like min_p and then use a high temperature to make the model creative without losing coherency. Note: this parameter takes precedence over "Sampler priority". That means that `temperature`/`dynamic_temperature`/`quadratic_sampling` will be removed from wherever they are and moved to the end of the stack. * **do_sample**: When unchecked, sampling is entirely disabled, and greedy decoding is used instead (the most likely token is always picked). * **Seed**: Set the Pytorch seed to this number. 
Note that some loaders do not use Pytorch (notably llama.cpp), and others are not deterministic (ExLlamaV2). For these loaders, the seed has no effect. @@ -66,25 +75,48 @@ For more information about the parameters, the [transformers documentation](http To the right (or below if you are on mobile), the following parameters are present: -* **Truncate the prompt up to this length**: Used to prevent the prompt from getting bigger than the model's context length. In the case of the transformers loader, which allocates memory dynamically, this parameter can also be used to set a VRAM ceiling and prevent out-of-memory errors. This parameter is automatically updated with the model's context length (from "n_ctx" or "max_seq_len" for loaders that use these parameters, and from the model metadata directly for loaders that do not) when you load a model. +* **Truncate the prompt up to this length**: Used to prevent the prompt from getting bigger than the model's context length. In the case of the transformers loader, which allocates memory dynamically, this parameter can also be used to set a VRAM ceiling and prevent out-of-memory errors. This parameter is automatically updated with the model's context length (from "ctx_size" for loaders that use this parameter, and from the model metadata directly for loaders that do not) when you load a model. * **Maximum number of tokens/second**: to make text readable in real-time in case the model is generating too fast. Good if you want to flex and tell everyone how good your GPU is. +* **Custom system message**: If not empty, will be used instead of the default system message in the instruction template. Useful for customizing the personality of the chatbot. Example: "You are a duck." * **Custom stopping strings**: The model stops generating as soon as any of the strings set in this field is generated. 
Note that when generating text in the Chat tab, some default stopping strings are set regardless of this parameter, like "\nYour Name:" and "\nBot name:" for chat mode. That's why this parameter has a "Custom" in its name. * **Custom token bans**: Allows you to ban the model from generating certain tokens altogether. You need to find the token IDs under "Default" > "Tokens" or "Notebook" > "Tokens", or by looking at the `tokenizer.json` for the model directly. * **auto_max_new_tokens**: When checked, the max_new_tokens parameter is expanded in the backend to the available context length. The maximum length is given by the "truncation_length" parameter. This is useful for getting long replies in the Chat tab without having to click on "Continue" many times. * **Ban the eos_token**: One of the possible tokens that a model can generate is the EOS (End of Sequence) token. When it is generated, the generation stops prematurely. When this parameter is checked, that token is banned from being generated, and the generation will always generate "max_new_tokens" tokens. * **Add the bos_token to the beginning of prompts**: By default, the tokenizer will add a BOS (Beginning of Sequence) token to your prompt. During training, BOS tokens are used to separate different documents. If unchecked, no BOS token will be added, and the model will interpret your prompt as being in the middle of a document instead of at the start of one. This significantly changes the output and can make it more creative. * **Skip special tokens**: When decoding the generated tokens, skip special tokens from being converted to their text representation. Otherwise, BOS appears as `<s>`, EOS as `</s>`, etc. +* **prompt_lookup_num_tokens**: Activates Prompt Lookup Decoding, a form of speculative decoding for the Transformers loader. It guesses future tokens by looking for matching patterns in the prompt itself, which can speed up generation for tasks that involve repeating or paraphrasing parts of the input.
* **Activate text streaming**: When unchecked, the full response is outputted at once, without streaming the words one at a time. I recommend unchecking this parameter on high latency networks like running the webui on Google Colab or using `--share`. +* **Static KV cache**: Use a static cache for improved performance with the Transformers loader. May not be compatible with all models. * **Sampler priority**: Allows you to customize the order in which the different samplers are applied. The first sampler on the list gets applied first. With this, custom orders like `top_p -> temperature -> top_k` can be defined. -* **Load grammar from file**: Loads a GBNF grammar from a file under `text-generation-webui/grammars`. The output is written to the "Grammar" box below. You can also save and delete custom grammars using this menu. +* **DRY sequence breakers**: Tokens across which DRY sequence matching is not continued. Typically punctuation and special tokens. Only used when DRY is active (dry_multiplier > 0). +* **Load grammar from file**: Loads a GBNF grammar from a file under `user_data/grammars`. The output is written to the "Grammar" box below. You can also save and delete custom grammars using this menu. * **Grammar**: Allows you to constrain the model output to a particular format. For instance, you can make the model generate lists, JSON, specific words, etc. Grammar is extremely powerful and I highly recommend it. The syntax looks a bit daunting at first sight, but it gets very easy once you understand it. See the [GBNF Guide](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md) for details. -## Character +### Chat tab controls -Parameters that define the character that is used in the Chat tab when "chat" or "chat-instruct" are selected under "Mode". 
+The following parameters appear in the Chat tab sidebar rather than the Parameters tab: -* **Character**: A dropdown menu where you can select from saved characters, save a new character (💾 button), and delete the selected character (🗑️). -* **Your name**: Your name as it appears in the prompt. +* **reasoning_effort**: Controls the thinking depth for models that support it (used by GPT-OSS). Options: low, medium, high. +* **enable_thinking**: Enables extended thinking mode for models that support it (used by Seed-OSS and pre-2507 Qwen3). When enabled, the model can use a thinking step before generating its reply. + +## Instruction template + +This sub-tab within the Parameters tab defines the instruction template used in the Chat tab when "instruct" or "chat-instruct" are selected under "Mode". + +* **Saved instruction templates**: A dropdown menu where you can select a template. Click **Load** to apply it. The 💾 button saves the current template, and the 🗑️ button deletes the selected one. +* **Instruction template**: A Jinja2 template that defines the prompt format for the instruction-following conversation. +* **Send to notebook**: Send the full instruction template in string format to the Notebook tab. +* **Chat template**: A Jinja2 template that defines the prompt format for regular chat conversations with characters. + +## Character tab + +The Character tab is a separate top-level tab that contains the following sub-tabs: + +### Character + +Parameters that define the character used in the Chat tab when "chat" or "chat-instruct" are selected under "Mode". + +* **Character**: A dropdown menu where you can select from saved characters, save a new character (💾 button), and delete the selected character (🗑️). The **Restore character** button resets the character to its last saved state. * **Character's name**: The bot name as it appears in the prompt. * **Context**: A string that is always at the top of the prompt. It never gets truncated. 
It usually defines the bot's personality and some key elements of the conversation. * **Greeting**: An opening message for the bot. When set, it appears whenever you start a new chat. @@ -98,31 +130,26 @@ Note: the following replacements take place in the context and greeting fields w So you can use those special placeholders in your character definitions. They are commonly found in TavernAI character cards. -## Instruction template +### User -Defines the instruction template that is used in the Chat tab when "instruct" or "chat-instruct" are selected under "Mode". +Allows you to create and manage user profiles. -* **Saved instruction templates**: A dropdown menu where you can load a saved template, save a new template (💾 button), and delete the currently selected template (🗑️). -* **Custom system message**: A message that defines the personality of the chatbot, replacing its default "System message" string. Example: "You are a duck." -* **Instruction template**: A Jinja2 template that defines the prompt format for the instruction-following conversation. -* **Send to default**: Send the full instruction template in string format to the Default tab. -* **Send to notebook**: Send the full instruction template in string format to the Notebook tab. -* **Send to negative prompt**: Send the full instruction template in string format to the "Negative prompt" field under "Parameters" > "Generation". -* **Chat template**: A Jinja2 template that defines the prompt format for regular chat conversations with characters. -* **Command for chat-instruct mode**: The command that is used in chat-instruct mode to query the model to generate a reply on behalf of the character. Can be used creatively to generate specific kinds of responses. +* **User**: A dropdown to select, save (💾), or delete (🗑️) user profiles. +* **Name**: Your name as it appears in the prompt. +* **Description**: An optional description of yourself that can be referenced in conversations. 
-## Chat history +### Chat history -In this tab, you can download the current chat history in JSON format and upload a previously saved chat history. +In this tab, you can download the current chat history in JSON format and upload a previously saved chat history. When a history is uploaded, a new chat is created to hold it. That is, you don't lose your current chat in the Chat tab. -## Upload character +### Upload character -### YAML or JSON +#### YAML or JSON -Allows you to upload characters in the YAML format used by the web UI, including optionally a profile picture. +Allows you to upload characters in the YAML format used by the web UI, including optionally a profile picture. -### TavernAI PNG +#### TavernAI PNG Allows you to upload a TavernAI character card. It will be converted to the internal YAML format of the web UI after upload. diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md index f44eb964..30b94151 100644 --- a/docs/04 - Model Tab.md +++ b/docs/04 - Model Tab.md @@ -2,35 +2,81 @@ This is where you load models, apply LoRAs to a loaded model, and download new m ## Model loaders +### llama.cpp + +Loads: GGUF models. Note: GGML models have been deprecated and do not work anymore. + +Example: https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF + +* **gpu_layers**: The number of layers to allocate to the GPU. If set to 0, only the CPU will be used. If you want to offload all layers, you can simply set this to the maximum value. +* **ctx_size**: Context length of the model. In llama.cpp, the cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on the metadata inside the GGUF file, but you may need to lower this value to fit the model into your GPU. Set to 0 for automatic context size based on available memory. 
After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "ctx_size" so that you don't have to set the same thing twice. +* **cache_type**: KV cache quantization type. Valid options: `fp16`, `q8_0`, `q4_0`. Lower quantization saves VRAM at the cost of some quality. +* **tensor_split**: For multi-gpu only. Sets the amount of memory to allocate per GPU as proportions. Not to be confused with other loaders where this is set in GB; here you can set something like `30,70` for 30%/70%. +* **batch_size**: Maximum number of prompt tokens to batch together when calling llama_eval. +* **ubatch_size**: Physical maximum batch size for prompt processing. +* **threads**: Number of threads. Recommended value: your number of physical cores. +* **threads_batch**: Number of threads for batch processing. Recommended value: your total number of cores (physical + virtual). +* **cpu_moe**: Force MoE expert layers to run on the CPU, keeping the rest on the GPU. +* **extra_flags**: Extra flags to pass to llama-server. Format: `flag1=value1,flag2,flag3=value3`. Example: `override-tensor=exps=CPU`. +* **mmproj**: Path to the mmproj file for multimodal (vision) models. This enables image understanding capabilities. +* **streaming_llm**: Experimental feature to avoid re-evaluating the entire prompt when part of it is removed, for instance, when you hit the context length for the model in chat mode and an old message is removed. +* **cpu**: Force a version of llama.cpp compiled without GPU acceleration to be used. Can usually be ignored. Only set this if you want to use CPU only and llama.cpp doesn't work otherwise. +* **row_split**: Split the model by rows across GPUs. This may improve multi-gpu performance. +* **no_kv_offload**: Do not offload the KV cache to the GPU. This saves VRAM but reduces performance. 
+* **no_mmap**: Loads the model into memory at once, possibly preventing I/O operations later on at the cost of a longer load time. +* **mlock**: Force the system to keep the model in RAM rather than swapping or compressing. +* **numa**: May improve performance on certain multi-cpu systems. + ### Transformers -Loads: full precision (16-bit or 32-bit) models. The repository usually has a clean name without GGUF, EXL2, GPTQ, or AWQ in its name, and the model files are named `pytorch_model.bin` or `model.safetensors`. +Loads: full precision (16-bit or 32-bit) models, as well as bitsandbytes-quantized models. The repository usually has a clean name without GGUF, EXL2, or EXL3 in its name, and the model files are named `model.safetensors` or split into parts like `model-00001-of-00004.safetensors`. -Example: [https://huggingface.co/lmsys/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5). +Example: [https://huggingface.co/lmsys/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5). Full precision models use a ton of VRAM, so you will usually want to select the "load_in_4bit" and "use_double_quant" options to load the model in 4-bit precision using bitsandbytes. -This loader can also load GPTQ models and train LoRAs with them. For that, make sure to check the "auto-devices" and "disable_exllama" options before loading the model. - Options: -* **gpu-memory**: When set to greater than 0, activates CPU offloading using the accelerate library, where part of the layers go to the CPU. The performance is very bad. Note that accelerate doesn't treat this parameter very literally, so if you want the VRAM usage to be at most 10 GiB, you may need to set this parameter to 9 GiB or 8 GiB. It can be used in conjunction with "load_in_8bit" but not with "load-in-4bit" as far as I'm aware. -* **cpu-memory**: Similarly to the parameter above, you can also set a limit on the amount of CPU memory used. 
Whatever doesn't fit either in the GPU or the CPU will go to a disk cache, so to use this option you should also check the "disk" checkbox. -* **compute_dtype**: Used when "load-in-4bit" is checked. I recommend leaving the default value. -* **quant_type**: Used when "load-in-4bit" is checked. I recommend leaving the default value. +* **gpu_split**: When using multiple GPUs, sets the amount of VRAM in GB to allocate per GPU. Example: `20,7,7`. +* **cpu_memory**: Maximum CPU memory in GiB to use for CPU offloading via the accelerate library. Whatever doesn't fit in the GPU or CPU will go to a disk cache if the "disk" checkbox is enabled. +* **compute_dtype**: Used when "load_in_4bit" is checked. I recommend leaving the default value. +* **quant_type**: Used when "load_in_4bit" is checked. I recommend leaving the default value. * **alpha_value**: Used to extend the context length of a model with a minor loss in quality. I have measured 1.75 to be optimal for 1.5x context, and 2.5 for 2x context. That is, with alpha = 2.5 you can make a model with 4096 context length go to 8192 context length. * **rope_freq_base**: Originally another way to write "alpha_value", it ended up becoming a necessary parameter for some models like CodeLlama, which was fine-tuned with this set to 1000000 and hence needs to be loaded with it set to 1000000 as well. * **compress_pos_emb**: The first and original context-length extension method, discovered by [kaiokendev](https://kaiokendev.github.io/til). When set to 2, the context length is doubled, 3 and it's tripled, etc. It should only be used for models that have been fine-tuned with this parameter set to different than 1. For models that have not been tuned to have greater context length, alpha_value will lead to a smaller accuracy loss. -* **cpu**: Loads the model in CPU mode using Pytorch. The model will be loaded in 32-bit precision, so a lot of RAM will be used. 
CPU inference with transformers is older than llama.cpp and it works, but it's a lot slower. Note: this parameter has a different interpretation in the llama.cpp loader (see below). -* **load-in-8bit**: Load the model in 8-bit precision using bitsandbytes. The 8-bit kernel in that library has been optimized for training and not inference, so load-in-8bit is slower than load-in-4bit (but more accurate). +* **attn_implementation**: Choose the attention implementation. Valid options: `sdpa`, `eager`, `flash_attention_2`. The default (`sdpa`) works well in most cases; `flash_attention_2` may be useful for training. +* **cpu**: Loads the model in CPU mode using Pytorch. The model will be loaded in 32-bit precision, so a lot of RAM will be used. CPU inference with transformers is older than llama.cpp and it works, but it's a lot slower. Note: this parameter has a different interpretation in the llama.cpp loader (see above). +* **load_in_8bit**: Load the model in 8-bit precision using bitsandbytes. The 8-bit kernel in that library has been optimized for training and not inference, so load_in_8bit is slower than load_in_4bit (but more accurate). * **bf16**: Use bfloat16 precision instead of float16 (the default). Only applies when quantization is not used. -* **auto-devices**: When checked, the backend will try to guess a reasonable value for "gpu-memory" to allow you to load a model with CPU offloading. I recommend just setting "gpu-memory" manually instead. This parameter is also needed for loading GPTQ models, in which case it needs to be checked before loading the model. * **disk**: Enable disk offloading for layers that don't fit into the GPU and CPU combined. -* **load-in-4bit**: Load the model in 4-bit precision using bitsandbytes. +* **load_in_4bit**: Load the model in 4-bit precision using bitsandbytes. +* **use_double_quant**: Use double quantization with 4-bit loading for reduced memory usage. 
* **trust-remote-code**: Some models use custom Python code to load the model or the tokenizer. For such models, this option needs to be set. It doesn't download any remote content: all it does is execute the .py files that get downloaded with the model. Those files can potentially include malicious code; I have never seen it happen, but it is in principle possible. * **no_use_fast**: Do not use the "fast" version of the tokenizer. Can usually be ignored; only check this if you can't load the tokenizer for your model otherwise. -* **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training. -* **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model. + +### ExLlamav3_HF + +Loads: EXL3 models. These models usually have "EXL3" or "exl3" in the model name. + +Uses the ExLlamaV3 backend with Transformers samplers. + +* **ctx_size**: Context length of the model. The cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on its metadata, but you may need to lower this value to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "ctx_size" so that you don't have to set the same thing twice. +* **cache_type**: KV cache quantization type. Valid options: `fp16`, `q2` to `q8`. You can also specify key and value bits separately, e.g. `q4_q8`. Lower quantization saves VRAM at the cost of some quality. +* **gpu_split**: Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: `20,7,7`. +* **cfg_cache**: Creates a second cache to hold the CFG negative prompts. You need to set this if and only if you intend to use CFG in the "Parameters" > "Generation" tab. 
Checking this parameter doubles the cache VRAM usage. +* **no_use_fast**: Do not use the "fast" version of the tokenizer. +* **enable_tp**: Enable Tensor Parallelism (TP) to split the model across GPUs. +* **tp_backend**: The backend for tensor parallelism. Valid options: `native`, `nccl`. Default: `native`. + +### ExLlamav3 + +The same as ExLlamav3_HF but using the internal samplers of ExLlamaV3 instead of the ones in the Transformers library. Supports speculative decoding with a draft model. Also supports multimodal (vision) models natively. + +* **ctx_size**: Same as ExLlamav3_HF. +* **cache_type**: Same as ExLlamav3_HF. +* **gpu_split**: Same as ExLlamav3_HF. +* **enable_tp**: Enable Tensor Parallelism (TP) to split the model across GPUs. +* **tp_backend**: The backend for tensor parallelism. Valid options: `native`, `nccl`. Default: `native`. ### ExLlamav2_HF @@ -41,73 +87,34 @@ Examples: * https://huggingface.co/turboderp/Llama2-70B-exl2 * https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ -* **gpu-split**: If you have multiple GPUs, the amount of memory to allocate per GPU should be set in this field. Make sure to set a lower value for the first GPU, as that's where the cache is allocated. -* **max_seq_len**: The maximum sequence length for the model. In ExLlamaV2, the cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on its metadata, but you may need to lower this value be able to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "max_seq_len" so that you don't have to set the same thing twice. -* **cfg-cache**: Creates a second cache to hold the CFG negative prompts. You need to set this if and only if you intend to use CFG in the "Parameters" > "Generation" tab. Checking this parameter doubles the cache VRAM usage. 
+* **ctx_size**: Context length of the model. In ExLlamaV2, the cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on its metadata, but you may need to lower this value to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "ctx_size" so that you don't have to set the same thing twice. +* **cache_type**: KV cache quantization type. Valid options: `fp16`, `fp8`, `q8`, `q6`, `q4`. Lower quantization saves VRAM at the cost of some quality. +* **gpu_split**: Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: `20,7,7`. Make sure to set a lower value for the first GPU, as that's where the cache is allocated. +* **alpha_value**: Used to extend the context length of a model with a minor loss in quality. Same as the Transformers parameter. +* **compress_pos_emb**: Positional embeddings compression factor. Same as the Transformers parameter. +* **num_experts_per_token**: Number of experts to use for generation. Applies to MoE models like Mixtral. +* **autosplit**: Autosplit the model tensors across the available GPUs. This causes gpu_split to be ignored. +* **enable_tp**: Enable Tensor Parallelism (TP) to split the model across GPUs. * **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed. -* **cache_8bit**: Create a 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much). -* **cache_4bit**: Creates a Q4 cache using grouped quantization. +* **no_xformers**: Force xformers to not be used. +* **no_sdpa**: Force Torch SDPA to not be used. +* **cfg_cache**: Creates a second cache to hold the CFG negative prompts. You need to set this if and only if you intend to use CFG in the "Parameters" > "Generation" tab. 
Checking this parameter doubles the cache VRAM usage. +* **no_use_fast**: Do not use the "fast" version of the tokenizer. ### ExLlamav2 -The same as ExLlamav2_HF but using the internal samplers of ExLlamav2 instead of the ones in the Transformers library. +The same as ExLlamav2_HF but using the internal samplers of ExLlamaV2 instead of the ones in the Transformers library. Supports speculative decoding with a draft model. -### AutoGPTQ +### TensorRT-LLM -Loads: GPTQ models. +Loads: TensorRT-LLM engine models. These are highly optimized models compiled specifically for NVIDIA GPUs. -* **wbits**: For ancient models without proper metadata, sets the model precision in bits manually. Can usually be ignored. -* **groupsize**: For ancient models without proper metadata, sets the model group size manually. Can usually be ignored. -* **triton**: Only available on Linux. Necessary to use models with both act-order and groupsize simultaneously. Note that ExLlamaV2 can load these same models on Windows without triton. -* **no_inject_fused_attention**: Improves performance while increasing the VRAM usage. -* **no_inject_fused_mlp**: Similar to the previous parameter but for Triton only. -* **no_use_cuda_fp16**: On some systems, the performance can be very bad with this unset. Can usually be ignored. -* **desc_act**: For ancient models without proper metadata, sets the model "act-order" parameter manually. Can usually be ignored. - -### llama.cpp - -Loads: GGUF models. Note: GGML models have been deprecated and do not work anymore. - -Example: https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF - -* **n-gpu-layers**: The number of layers to allocate to the GPU. If set to 0, only the CPU will be used. If you want to offload all layers, you can simply set this to the maximum value. -* **n_ctx**: Context length of the model. In llama.cpp, the cache is preallocated, so the higher this value, the higher the VRAM. 
It is automatically set to the maximum sequence length for the model based on the metadata inside the GGUF file, but you may need to lower this value be able to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "n_ctx" so that you don't have to set the same thing twice. -* **tensor_split**: For multi-gpu only. Sets the amount of memory to allocate per GPU as proportions. Not to be confused with other loaders where this is set in GB; here you can set something like `30,70` for 30%/70%. -* **n_batch**: Batch size for prompt processing. Higher values are supposed to make generation faster, but I have never obtained any benefit from changing this value. -* **threads**: Number of threads. Recommended value: your number of physical cores. -* **threads_batch**: Number of threads for batch processing. Recommended value: your total number of cores (physical + virtual). -* **tensorcores**: Use llama.cpp compiled with "tensor cores" support, which improves performance on NVIDIA RTX cards in most cases. -* **streamingllm**: Experimental feature to avoid re-evaluating the entire prompt when part of it is removed, for instance, when you hit the context length for the model in chat mode and an old message is removed. -* **cpu**: Force a version of llama.cpp compiled without GPU acceleration to be used. Can usually be ignored. Only set this if you want to use CPU only and llama.cpp doesn't work otherwise. -* **no_mul_mat_q**: Disable the mul_mat_q kernel. This kernel usually improves generation speed significantly. This option to disable it is included in case it doesn't work on some system. -* **no-mmap**: Loads the model into memory at once, possibly preventing I/O operations later on at the cost of a longer load time. -* **mlock**: Force the system to keep the model in RAM rather than swapping or compressing (no idea what this means, never used it). 
-* **numa**: May improve performance on certain multi-cpu systems. - -### llamacpp_HF - -The same as llama.cpp but with transformers samplers, and using the transformers tokenizer instead of the internal llama.cpp tokenizer. - -To use it, you need to download a tokenizer. There are two options: - -1) Download `oobabooga/llama-tokenizer` under "Download model or LoRA". That's a default Llama tokenizer. -2) Place your .gguf in a subfolder of `models/` along with these 3 files: `tokenizer.model`, `tokenizer_config.json`, and `special_tokens_map.json`. This takes precedence over Option 1. - -It has an additional parameter: - -* **logits_all**: Needs to be checked if you want to evaluate the perplexity of the llama.cpp model using the "Training" > "Perplexity evaluation" tab. Otherwise, leave it unchecked, as it makes prompt processing slower. - -### AutoAWQ - -Loads: AWQ models. - -Example: https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v2-AWQ - -The parameters are overall similar to AutoGPTQ. +* **ctx_size**: Context length of the model. +* **cpp_runner**: Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. ## Model dropdown -Here you can select a model to be loaded, refresh the list of available models (🔄), load/unload/reload the selected model, and save the settings for the model. The "settings" are the values in the input fields (checkboxes, sliders, dropdowns) below this dropdown. +Here you can select a model to be loaded, refresh the list of available models, load/unload/reload the selected model, and save the settings for the model. The "settings" are the values in the input fields (checkboxes, sliders, dropdowns) below this dropdown. After saving, those settings will get restored whenever you select that model again in the dropdown menu. @@ -115,15 +122,15 @@ If the **Autoload the model** checkbox is selected, the model will be loaded as ## LoRA dropdown -Used to apply LoRAs to the model. 
Note that LoRA support is not implemented for all loaders. Check this [page](https://github.com/oobabooga/text-generation-webui/wiki) for details. +Used to apply LoRAs to the model. Note that LoRA support is not implemented for all loaders. Check the [What Works](https://github.com/oobabooga/text-generation-webui/wiki/What-Works) page for details. ## Download model or LoRA Here you can download a model or LoRA directly from the https://huggingface.co/ website. -* Models will be saved to `text-generation-webui/models`. -* LoRAs will be saved to `text-generation-webui/loras`. +* Models will be saved to `user_data/models`. +* LoRAs will be saved to `user_data/loras`. -In the input field, you can enter either the Hugging Face username/model path (like `facebook/galactica-125m`) or the full model URL (like `https://huggingface.co/facebook/galactica-125m`). To specify a branch, add it at the end after a ":" character like this: `facebook/galactica-125m:main`. +In the input field, you can enter either the Hugging Face username/model path (like `facebook/galactica-125m`) or the full model URL (like `https://huggingface.co/facebook/galactica-125m`). To specify a branch, add it at the end after a ":" character like this: `facebook/galactica-125m:main`. To download a single file, as necessary for models in GGUF format, you can click on "Get file list" after entering the model path in the input field, and then copy and paste the desired file name in the "File name" field before clicking on "Download". diff --git a/docs/06 - Session Tab.md b/docs/06 - Session Tab.md index fe96e5ca..48735c36 100644 --- a/docs/06 - Session Tab.md +++ b/docs/06 - Session Tab.md @@ -1,6 +1,15 @@ Here you can restart the UI with new settings. -* **Available extensions**: shows a list of extensions available under `text-generation-webui/extensions`. +## Settings + +* **Toggle light/dark theme**: switches between light and dark mode. 
+* **Show two columns in the Notebook tab**: toggles between the two-column Default layout and the single-column Notebook layout. +* **Turn long pasted text into attachments in the Chat tab**: when enabled, long pasted text is automatically converted into file attachments. +* **Include attachments/search results from previous messages in the chat prompt**: when enabled, attachments and web search results from earlier messages are included in subsequent prompts. + +## Extensions & flags + +* **Available extensions**: shows a list of extensions available under `text-generation-webui/extensions` and `text-generation-webui/user_data/extensions`. Note that some of these extensions may require manually installing Python requirements through the command: `pip install -r extensions/extension_name/requirements.txt`. * **Boolean command-line flags**: shows command-line flags of bool (true/false) type. After selecting your desired flags and extensions, you can restart the UI by clicking on **Apply flags/extensions and restart**. @@ -27,6 +36,6 @@ If you used the one-click installer, this command should be executed in the term ## Saving UI defaults -The **Save UI defaults to settings.yaml** button gathers the visible values in the UI and saves them to settings.yaml so that your settings will persist across multiple restarts of the UI. +The **Save extensions settings to user_data/settings.yaml** button gathers the visible values in the UI and saves them to `user_data/settings.yaml` so that your settings will persist across multiple restarts of the UI. Note that preset parameters like temperature are not individually saved, so you need to first save your preset and select it in the preset menu before saving the defaults. 
diff --git a/docs/07 - Extensions.md b/docs/07 - Extensions.md index ebcd3c0e..63bddf2c 100644 --- a/docs/07 - Extensions.md +++ b/docs/07 - Extensions.md @@ -21,17 +21,20 @@ If you create an extension, you are welcome to host it in a GitHub repository an |Extension|Description| |---------|-----------| |[openai](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai)| Creates an API that mimics the OpenAI API and can be used as a drop-in replacement. | -|[multimodal](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) | Adds multimodality support (text+images). For a detailed description see [README.md](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal/README.md) in the extension directory. | -|[google_translate](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/google_translate)| Automatically translates inputs and outputs using Google Translate.| +|[Training_PRO](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/Training_PRO)| Advanced LoRA training with support for model and LoRA merging. | +|[superboogav2](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/superboogav2)| Enhanced RAG extension with support for PDF, DOCX, and PPTX files. | +|[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. | +|[coqui_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/coqui_tts)| Text-to-speech extension using Coqui XTTS v2. | |[silero_tts](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/silero_tts)| Text-to-speech extension using [Silero](https://github.com/snakers4/silero-models). When used in chat mode, responses are replaced with an audio widget. 
| |[whisper_stt](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/whisper_stt)| Allows you to enter your inputs in chat mode using your microphone. | -|[sd_api_pictures](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/sd_api_pictures)| Allows you to request pictures from the bot in chat mode, which will be generated using the AUTOMATIC1111 Stable Diffusion API. See examples [here](https://github.com/oobabooga/text-generation-webui/pull/309). | -|[character_bias](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/character_bias)| Just a very simple example that adds a hidden string at the beginning of the bot's reply in chat mode. | -|[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. | -|[gallery](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/gallery/)| Creates a gallery with the chat characters and their pictures. | -|[superbooga](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/superbooga)| An extension that uses ChromaDB to create an arbitrarily large pseudocontext, taking as input text files, URLs, or pasted text. Based on https://github.com/kaiokendev/superbig. | -|[ngrok](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/ngrok)| Allows you to access the web UI remotely using the ngrok reverse tunnel service (free). It's an alternative to the built-in Gradio `--share` feature. | |[perplexity_colors](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/perplexity_colors)| Colors each token in the output text by its associated probability, as derived from the model logits. 
| +|[google_translate](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/google_translate)| Automatically translates inputs and outputs using Google Translate.| +|[gallery](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/gallery/)| Creates a gallery with the chat characters and their pictures. | +|[sd_api_pictures](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/sd_api_pictures)| Allows you to request pictures from the bot in chat mode, which will be generated using the AUTOMATIC1111 Stable Diffusion API. See examples [here](https://github.com/oobabooga/text-generation-webui/pull/309). | +|[long_replies](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/long_replies)| Forces longer replies by suppressing early newlines in the model output. | +|[ngrok](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/ngrok)| Allows you to access the web UI remotely using the ngrok reverse tunnel service (free). It's an alternative to the built-in Gradio `--share` feature. | +|[superbooga](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/superbooga)| An extension that uses ChromaDB to create an arbitrarily large pseudocontext, taking as input text files, URLs, or pasted text. Based on https://github.com/kaiokendev/superbig. | +|[character_bias](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/character_bias)| Just a very simple example that adds a hidden string at the beginning of the bot's reply in chat mode. | ## How to write an extension @@ -51,8 +54,8 @@ The extensions framework is based on special functions and variables that you ca | `def history_modifier(history)` | Modifies the chat history before the text generation in chat mode begins. | | `def custom_generate_reply(...)` | Overrides the main text generation function. | | `def custom_generate_chat_prompt(...)` | Overrides the prompt generator in chat mode. 
| -| `def tokenizer_modifier(state, prompt, input_ids, input_embeds)` | Modifies the `input_ids`/`input_embeds` fed to the model. Should return `prompt`, `input_ids`, `input_embeds`. See the `multimodal` extension for an example. | -| `def custom_tokenized_length(prompt)` | Used in conjunction with `tokenizer_modifier`, returns the length in tokens of `prompt`. See the `multimodal` extension for an example. | +| `def tokenizer_modifier(state, prompt, input_ids, input_embeds)` | Modifies the `input_ids`/`input_embeds` fed to the model. Should return `prompt`, `input_ids`, `input_embeds`. See the `example` extension for a template. | +| `def custom_tokenized_length(prompt)` | Used in conjunction with `tokenizer_modifier`, returns the length in tokens of `prompt`. See the `example` extension for a template. | Additionally, you can define a special `params` dictionary. In it, the `display_name` key is used to define the displayed name of the extension in the UI, and the `is_tab` key is used to define whether the extension should appear in a new tab. By default, extensions appear at the bottom of the "Text generation" tab. @@ -186,7 +189,7 @@ def bot_prefix_modifier(string, state): def tokenizer_modifier(state, prompt, input_ids, input_embeds): """ Modifies the input ids and embeds. - Used by the multimodal extension to put image embeddings in the prompt. + The returned prompt, input_ids, and input_embeds are what is fed to the model. Only used by loaders that use the transformers library for sampling. """ return prompt, input_ids, input_embeds diff --git a/docs/11 - AMD Setup.md b/docs/11 - AMD Setup.md index 0bd22e7e..6db7989d 100644 --- a/docs/11 - AMD Setup.md +++ b/docs/11 - AMD Setup.md @@ -1,13 +1,25 @@ ## Using an AMD GPU in Linux -Requires ROCm SDK 5.4.2 or 5.4.3 to be installed. Some systems may also -need: +Requires ROCm 6.4 to be installed. + +### Option 1: One-click installer + +The one-click installer (`start_linux.sh`) automatically detects AMD GPUs.
When prompted, select the AMD option, or set the `GPU_CHOICE` environment variable before running: ``` -sudo apt-get install libstdc++-12-dev +GPU_CHOICE=B ./start_linux.sh ``` -Edit the "one_click.py" script using a text editor and un-comment and -modify the lines near the top of the script according to your setup. In -particular, modify the `os.environ["ROCM_PATH"] = '/opt/rocm'` line to -point to your ROCm installation. +### Option 2: Manual conda install + +Follow the manual conda installation instructions in the README, using the AMD PyTorch command: + +``` +pip3 install torch==2.9.1 --index-url https://download.pytorch.org/whl/rocm6.4 +``` + +Then install the project requirements with the AMD requirements file: + +``` +pip install -r requirements/full/requirements_amd.txt +``` diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 94a95b10..cd5757f6 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \ #### Chat completions -Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `models/config.yaml`. +Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`. ```shell curl http://127.0.0.1:5000/v1/chat/completions \ @@ -476,51 +476,45 @@ OPENAI_API_KEY=sk-111111111111111111111111111111111111111111111111 OPENAI_API_BASE=http://127.0.0.1:5000/v1 ``` -With the [official python openai client](https://github.com/openai/openai-python), the address can be set like this: +With the [official python openai client](https://github.com/openai/openai-python) (v1.x), the address can be set like this: ```python -import openai +from openai import OpenAI -openai.api_key = "..." 
-openai.api_base = "http://127.0.0.1:5000/v1" -openai.api_version = "2023-05-15" +client = OpenAI( + api_key="sk-111111111111111111111111111111111111111111111111", + base_url="http://127.0.0.1:5000/v1" +) + +response = client.chat.completions.create( + model="x", + messages=[{"role": "user", "content": "Hello!"}] +) +print(response.choices[0].message.content) ``` -If using .env files to save the `OPENAI_API_BASE` and `OPENAI_API_KEY` variables, make sure the .env file is loaded before the openai module is imported: - -```python -from dotenv import load_dotenv -load_dotenv() # make sure the environment variables are set before import -import openai -``` - -With the [official Node.js openai client](https://github.com/openai/openai-node) it is slightly more more complex because the environment variables are not used by default, so small source code changes may be required to use the environment variables, like so: +With the [official Node.js openai client](https://github.com/openai/openai-node) (v4.x): ```js -const openai = OpenAI( - Configuration({ - apiKey: process.env.OPENAI_API_KEY, - basePath: process.env.OPENAI_API_BASE - }) -); -``` +import OpenAI from "openai"; -For apps made with the [chatgpt-api Node.js client library](https://github.com/transitive-bullshit/chatgpt-api): - -```js -const api = new ChatGPTAPI({ +const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, - apiBaseUrl: process.env.OPENAI_API_BASE + baseURL: "http://127.0.0.1:5000/v1", }); + +const response = await client.chat.completions.create({ + model: "x", + messages: [{ role: "user", content: "Hello!" }], +}); +console.log(response.choices[0].message.content); ``` ### Embeddings (alpha) -Embeddings requires `sentence-transformers` installed, but chat and completions will function without it loaded. The embeddings endpoint is currently using the HuggingFace model: `sentence-transformers/all-mpnet-base-v2` for embeddings. 
This produces 768 dimensional embeddings (the same as the text-davinci-002 embeddings), which is different from OpenAI's current default `text-embedding-ada-002` model which produces 1536 dimensional embeddings. The model is small-ish and fast-ish. This model and embedding size may change in the future. +Embeddings require `sentence-transformers` to be installed, but chat and completions will function without it loaded. The embeddings endpoint currently uses the HuggingFace model `sentence-transformers/all-mpnet-base-v2`. This produces 768-dimensional embeddings. The model is small and fast. This model and embedding size may change in the future. | model name | dimensions | input max tokens | speed | size | Avg. performance | | ---------------------- | ---------- | ---------------- | ----- | ---- | ---------------- | -| text-embedding-ada-002 | 1536 | 8192 | - | - | - | -| text-davinci-002 | 768 | 2046 | - | - | - | | all-mpnet-base-v2 | 768 | 384 | 2800 | 420M | 63.3 | | all-MiniLM-L6-v2 | 384 | 256 | 14200 | 80M | 58.8 | @@ -528,50 +522,33 @@ In short, the all-MiniLM-L6-v2 model is 5x faster, 5x smaller ram, 2x smaller st Warning: You cannot mix embeddings from different models even if they have the same dimensions. They are not comparable. -### Compatibility & not so compatibility +### Compatibility -Note: the table below may be obsolete. - -| API endpoint | tested with | notes | -| ------------------------- | ---------------------------------- | --------------------------------------------------------------------------- | -| /v1/chat/completions | openai.ChatCompletion.create() | Use it with instruction following models | -| /v1/embeddings | openai.Embedding.create() | Using SentenceTransformer embeddings | -| /v1/images/generations | openai.Image.create() | Bare bones, no model configuration, response_format='b64_json' only.
| -| /v1/moderations | openai.Moderation.create() | Basic initial support via embeddings | -| /v1/models | openai.Model.list() | Lists models, Currently loaded model first, plus some compatibility options | -| /v1/models/{id} | openai.Model.get() | returns whatever you ask for | -| /v1/edits | openai.Edit.create() | Removed, use /v1/chat/completions instead | -| /v1/text_completion | openai.Completion.create() | Legacy endpoint, variable quality based on the model | -| /v1/completions | openai api completions.create | Legacy endpoint (v0.25) | -| /v1/engines/\*/embeddings | python-openai v0.25 | Legacy endpoint | -| /v1/engines/\*/generate | openai engines.generate | Legacy endpoint | -| /v1/engines | openai engines.list | Legacy Lists models | -| /v1/engines/{model_name} | openai engines.get -i {model_name} | You can use this legacy endpoint to load models via the api or command line | -| /v1/images/edits | openai.Image.create_edit() | not yet supported | -| /v1/images/variations | openai.Image.create_variation() | not yet supported | -| /v1/audio/\* | openai.Audio.\* | supported | -| /v1/files\* | openai.Files.\* | not yet supported | -| /v1/fine-tunes\* | openai.FineTune.\* | not yet supported | -| /v1/search | openai.search, engines.search | not yet supported | +| API endpoint | notes | +| ------------------------- | --------------------------------------------------------------------------- | +| /v1/chat/completions | Use with instruction-following models. Supports streaming, tool calls. | +| /v1/completions | Text completion endpoint. | +| /v1/embeddings | Using SentenceTransformer embeddings. | +| /v1/images/generations | Image generation, response_format='b64_json' only. | +| /v1/moderations | Basic support via embeddings. | +| /v1/models | Lists models. Currently loaded model first. | +| /v1/models/{id} | Returns model info. | +| /v1/audio/\* | Supported. | +| /v1/images/edits | Not yet supported. | +| /v1/images/variations | Not yet supported. 
| #### Applications -Almost everything needs the `OPENAI_API_KEY` and `OPENAI_API_BASE` environment variable set, but there are some exceptions. +Almost everything needs the `OPENAI_API_KEY` and `OPENAI_API_BASE` environment variables set, but there are some exceptions. -Note: the table below may be obsolete. - -| Compatibility | Application/Library | Website | Notes | -| ------------- | ---------------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| ✅❌ | openai-python (v0.25+) | https://github.com/openai/openai-python | only the endpoints from above are working. OPENAI_API_BASE=http://127.0.0.1:5001/v1 | -| ✅❌ | openai-node | https://github.com/openai/openai-node | only the endpoints from above are working. environment variables don't work by default, but can be configured (see above) | -| ✅❌ | chatgpt-api | https://github.com/transitive-bullshit/chatgpt-api | only the endpoints from above are working. 
environment variables don't work by default, but can be configured (see above) | -| ✅ | anse | https://github.com/anse-app/anse | API Key & URL configurable in UI, Images also work | -| ✅ | shell_gpt | https://github.com/TheR1D/shell_gpt | OPENAI_API_HOST=http://127.0.0.1:5001 | -| ✅ | gpt-shell | https://github.com/jla/gpt-shell | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | -| ✅ | gpt-discord-bot | https://github.com/openai/gpt-discord-bot | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | -| ✅ | OpenAI for Notepad++ | https://github.com/Krazal/nppopenai | api_url=http://127.0.0.1:5001 in the config file, or environment variables | -| ✅ | vscode-openai | https://marketplace.visualstudio.com/items?itemName=AndrewButson.vscode-openai | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | -| ✅❌ | langchain | https://github.com/hwchase17/langchain | OPENAI_API_BASE=http://127.0.0.1:5001/v1 even with a good 30B-4bit model the result is poor so far. It assumes zero shot python/json coding. Some model tailored prompt formatting improves results greatly. | -| ✅❌ | Auto-GPT | https://github.com/Significant-Gravitas/Auto-GPT | OPENAI_API_BASE=http://127.0.0.1:5001/v1 Same issues as langchain. Also assumes a 4k+ context | -| ✅❌ | babyagi | https://github.com/yoheinakajima/babyagi | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | -| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported | +| Compatibility | Application/Library | Website | Notes | +| ------------- | -------------------- | ------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------- | +| ✅❌ | openai-python | https://github.com/openai/openai-python | Use `OpenAI(base_url="http://127.0.0.1:5000/v1")`. Only the endpoints from above work. | +| ✅❌ | openai-node | https://github.com/openai/openai-node | Use `new OpenAI({baseURL: "http://127.0.0.1:5000/v1"})`. See example above. 
| +| ✅ | anse | https://github.com/anse-app/anse | API Key & URL configurable in UI, Images also work. | +| ✅ | shell_gpt | https://github.com/TheR1D/shell_gpt | OPENAI_API_HOST=http://127.0.0.1:5000 | +| ✅ | gpt-shell | https://github.com/jla/gpt-shell | OPENAI_API_BASE=http://127.0.0.1:5000/v1 | +| ✅ | gpt-discord-bot | https://github.com/openai/gpt-discord-bot | OPENAI_API_BASE=http://127.0.0.1:5000/v1 | +| ✅ | OpenAI for Notepad++ | https://github.com/Krazal/nppopenai | api_url=http://127.0.0.1:5000 in the config file, or environment variables. | +| ✅ | vscode-openai | https://marketplace.visualstudio.com/items?itemName=AndrewButson.vscode-openai | OPENAI_API_BASE=http://127.0.0.1:5000/v1 | +| ✅❌ | langchain | https://github.com/hwchase17/langchain | Use `base_url="http://127.0.0.1:5000/v1"`. Results depend on model and prompt formatting. | diff --git a/docs/What Works.md b/docs/What Works.md index 80abdc7f..d9c85300 100644 --- a/docs/What Works.md +++ b/docs/What Works.md @@ -1,20 +1,19 @@ ## What Works -| Loader | Loading 1 LoRA | Loading 2 or more LoRAs | Training LoRAs | Multimodal extension | Perplexity evaluation | -|----------------|----------------|-------------------------|----------------|----------------------|-----------------------| -| Transformers | ✅ | ✅\*\* | ✅\* | ✅ | ✅ | -| llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF | -| llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ | -| ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ | -| ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF | -| AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ | -| AutoAWQ | ? | ❌ | ? | ? | ✅ | -| HQQ | ? | ? | ? | ? 
| ✅ | +| Loader | Loading LoRAs | Training LoRAs | Multimodal | Perplexity evaluation | +|----------------|---------------|----------------|------------|-----------------------| +| llama.cpp | ❌ | ❌ | ✅\* | ❌ | +| Transformers | ✅ | ✅ | ✅\*\* | ✅ | +| ExLlamav3_HF | ❌ | ❌ | ❌ | ✅ | +| ExLlamav3 | ❌ | ❌ | ✅ | ❌ | +| ExLlamav2_HF | ✅ | ❌ | ❌ | ✅ | +| ExLlamav2 | ✅ | ❌ | ❌ | ❌ | +| TensorRT-LLM | ❌ | ❌ | ❌ | ❌ | -❌ = not implemented +❌ = not supported -✅ = implemented +✅ = supported -\* Training LoRAs with GPTQ models also works with the Transformers loader. Make sure to check "auto-devices" and "disable_exllama" before loading the model. +\* Via the `mmproj` parameter (multimodal projector file). -\*\* Multi-LoRA in PEFT is tricky and the current implementation does not work reliably in all cases. +\*\* Via the `send_pictures` extension.