diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml
index b9647b16..fb9e61b0 100644
--- a/.github/workflows/build-portable-release-cuda.yml
+++ b/.github/workflows/build-portable-release-cuda.yml
@@ -101,7 +101,7 @@ jobs:
       - name: Build Package
         shell: bash
         run: |
-          rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
+          rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
 
           # Define common variables
           CUDA_VERSION="${{ matrix.cuda }}"
diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml
index 287635a3..3724e384 100644
--- a/.github/workflows/build-portable-release-vulkan.yml
+++ b/.github/workflows/build-portable-release-vulkan.yml
@@ -100,7 +100,7 @@ jobs:
       - name: Build Package
         shell: bash
         run: |
-          rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
+          rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
 
           # Define common variables
           AVX_SUPPORT="${{ matrix.avx }}"
diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml
index 6e041966..bdf96cec 100644
--- a/.github/workflows/build-portable-release.yml
+++ b/.github/workflows/build-portable-release.yml
@@ -100,7 +100,7 @@ jobs:
      - name: Build Package
        shell: bash
        run: |
-          rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
+          rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
 
           # Define common variables
           AVX_SUPPORT="${{ matrix.avx }}"
diff --git a/.gitignore b/.gitignore
index 318e147d..bd69c941 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,26 +1,8 @@
-/cache
-/characters
 /css
 /extensions
-/grammars
 /installer_files
-/logs
-/loras
-/models
-/presets
-/prompts
 /repositories
-/softprompts
-/torch-dumps
-/training/datasets
-
-/CMD_FLAGS.txt
-/img_bot*
-/img_me*
-/models/config-user.yaml
-/notification.mp3
-/settings*.json
-/settings*.yaml
+/user_data
 
 .chroma
 .DS_Store
diff --git a/CMD_FLAGS.txt b/CMD_FLAGS.txt
deleted file mode 100644
index c2d63d9e..00000000
--- a/CMD_FLAGS.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-# Only used by the one-click installer.
-# Example:
-# --listen --api
diff --git a/README.md b/README.md
index f62e3508..2f92ed06 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
 
 To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again.
 
-You can use command-line flags, like `./start_linux.sh --help`, or add them to `CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
+You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
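For reference, here is what the relocated `user_data/CMD_FLAGS.txt` could contain after this change; the flags are only an illustration, echoing the example comments in the deleted root-level file and the `--api` hint in the README line above:

```txt
# Only used by the one-click installer.
--listen --api
```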
@@ -157,7 +157,7 @@ mkdir -p logs cache # TORCH_CUDA_ARCH_LIST based on your GPU model # APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal) # BUILD_EXTENIONS optionally add comma separated list of extensions to build -# Edit CMD_FLAGS.txt and add in it the options you want to execute (like --listen --cpu) +# Edit user_data/CMD_FLAGS.txt and add in it the options you want to execute (like --listen --cpu) # docker compose up --build ``` @@ -182,131 +182,139 @@ List of command-line flags ```txt -usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--settings SETTINGS] - [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices] [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] - [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] - [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] - [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] - [--numa] [--no-kv-offload] [--row-split] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] - [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] - [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] - [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] - [--subpath SUBPATH] [--old-colors] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] - [--api-disable-ipv4] [--nowebui] +usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS] + [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] + [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] + [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] + [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] + [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] + [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] 
[--cpp-runner] + [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] + [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] + [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api] + [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text generation web UI options: - -h, --help show this help message and exit + -h, --help show this help message and exit Basic settings: - --multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly. - --character CHARACTER The name of the character to load in chat mode by default. - --model MODEL Name of the model to load by default. - --lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. - --model-dir MODEL_DIR Path to directory with all the models. - --lora-dir LORA_DIR Path to directory with all the loras. - --settings SETTINGS Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this - file will be loaded by default without the need to use the --settings flag. - --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. - --verbose Print the prompts to the terminal. - --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. + --multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly. + --character CHARACTER The name of the character to load in chat mode by default. + --model MODEL Name of the model to load by default. + --lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces. + --model-dir MODEL_DIR Path to directory with all the models. + --lora-dir LORA_DIR Path to directory with all the loras. + --model-menu Show a model menu in the terminal when the web UI is first launched. + --settings SETTINGS Load the default interface settings from this yaml file. See user_data/settings-template.yaml for an example. If you create a file called + user_data/settings.yaml, this file will be loaded by default without the need to use the --settings flag. + --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. + --verbose Print the prompts to the terminal. + --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, - HQQ, TensorRT-LLM. + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, + TensorRT-LLM. 
Transformers/Accelerate: - --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. - --auto-devices Automatically split the model across the available GPU(s) and CPU. - --gpu-memory GPU_MEMORY [GPU_MEMORY ...] Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values - in MiB like --gpu-memory 3500MiB. - --cpu-memory CPU_MEMORY Maximum CPU memory in GiB to allocate for offloaded weights. Same as above. - --disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. - --disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "cache". - --load-in-8bit Load the model with 8-bit precision (using bitsandbytes). - --bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. - --no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. - --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. - --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. - --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. - --use_flash_attention_2 Set use_flash_attention_2=True while loading the model. - --use_eager_attention Set attn_implementation= eager while loading the model. - --torch-compile Compile the model with torch.compile for improved performance. + --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. + --cpu-memory CPU_MEMORY Maximum CPU memory in GiB. Use this for CPU offloading. + --disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. + --disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "user_data/cache". + --load-in-8bit Load the model with 8-bit precision (using bitsandbytes). + --bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. + --no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. + --trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models. + --force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution. + --no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. + --use_flash_attention_2 Set use_flash_attention_2=True while loading the model. + --use_eager_attention Set attn_implementation= eager while loading the model. + --torch-compile Compile the model with torch.compile for improved performance. bitsandbytes 4-bit: - --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). - --use_double_quant use_double_quant for 4-bit. - --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. - --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. + --load-in-4bit Load the model with 4-bit precision (using bitsandbytes). + --use_double_quant use_double_quant for 4-bit. + --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. + --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. llama.cpp: - --flash-attn Use flash-attention. 
- --n_ctx N_CTX Size of the prompt context. - --threads THREADS Number of threads to use. - --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. - --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. - --no-mmap Prevent mmap from being used. - --mlock Force the system to keep the model in RAM. - --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. - --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. - --numa Activate NUMA task allocation for llama.cpp. - --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. - --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. + --flash-attn Use flash-attention. + --threads THREADS Number of threads to use. + --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. + --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. + --no-mmap Prevent mmap from being used. + --mlock Force the system to keep the model in RAM. + --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. + --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. + --numa Activate NUMA task allocation for llama.cpp. + --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. + --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU" + --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. + +Context and cache management: + --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. + +Speculative decoding: + --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. + --draft-max DRAFT_MAX Number of tokens to draft for speculative decoding. + --gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model. + --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 + --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. ExLlamaV2: - --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. - --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. - --max_seq_len MAX_SEQ_LEN Maximum sequence length. - --cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. - --no_flash_attn Force flash-attention to not be used. - --no_xformers Force xformers to not be used. - --no_sdpa Force Torch SDPA to not be used. - --num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral. - --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. + --gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. + --autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored. + --cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. 
Necessary to use CFG with that loader. + --no_flash_attn Force flash-attention to not be used. + --no_xformers Force xformers to not be used. + --no_sdpa Force Torch SDPA to not be used. + --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. + --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. HQQ: - --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. + --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. TensorRT-LLM: - --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. + --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. Cache: - --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. + --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. DeepSpeed: - --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. - --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. - --local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups. + --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. + --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. + --local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups. RoPE: - --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. - --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). - --compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. + --alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. + --rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63). + --compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. Gradio: - --listen Make the web UI reachable from your local network. - --listen-port LISTEN_PORT The listening port that the server will use. - --listen-host LISTEN_HOST The hostname that the server will use. - --share Create a public URL. This is useful for running the web UI on Google Colab or similar. - --auto-launch Open the web UI in the default browser upon launch. - --gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". - --gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. - --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. - --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. 
- --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy - --old-colors Use the legacy Gradio colors, before the December/2024 update. + --listen Make the web UI reachable from your local network. + --listen-port LISTEN_PORT The listening port that the server will use. + --listen-host LISTEN_HOST The hostname that the server will use. + --share Create a public URL. This is useful for running the web UI on Google Colab or similar. + --auto-launch Open the web UI in the default browser upon launch. + --gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3". + --gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above. + --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. + --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. + --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy + --old-colors Use the legacy Gradio colors, before the December/2024 update. API: - --api Enable the API extension. - --public-api Create a public URL for the API using Cloudfare. - --public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. - --api-port API_PORT The listening port for the API. - --api-key API_KEY API authentication key. - --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. - --api-enable-ipv6 Enable IPv6 for the API - --api-disable-ipv4 Disable IPv4 for the API - --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. + --api Enable the API extension. + --public-api Create a public URL for the API using Cloudfare. + --public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option. + --api-port API_PORT The listening port for the API. + --api-key API_KEY API authentication key. + --admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key. + --api-enable-ipv6 Enable IPv6 for the API + --api-disable-ipv4 Disable IPv4 for the API + --nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode. ```
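As a hedged example of combining the reworked llama.cpp and speculative-decoding flags listed above, a launch command might look like the following; the two GGUF file names are placeholders, not files referenced anywhere in this diff:

```
./start_linux.sh --model main-model.Q4_K_M.gguf --model-draft draft-model.Q4_K_M.gguf --draft-max 8 --gpu-layers-draft 99 --ctx-size 8192
```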
@@ -317,35 +325,37 @@ https://github.com/oobabooga/text-generation-webui/wiki
 
 ## Downloading models
 
-Models should be placed in the folder `text-generation-webui/models`. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads).
+Models should be placed in the folder `text-generation-webui/user_data/models`. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads).
 
-* GGUF models are a single file and should be placed directly into `models`. Example:
+* GGUF models are a single file and should be placed directly into `user_data/models`. Example:
 
 ```
 text-generation-webui
-└── models
-    └── llama-2-13b-chat.Q4_K_M.gguf
+└── user_data
+    └── models
+        └── llama-2-13b-chat.Q4_K_M.gguf
 ```
 
 * The remaining model types (like 16-bit Transformers models and EXL2 models) are made of several files and must be placed in a subfolder. Example:
 
 ```
 text-generation-webui
-├── models
-│   ├── lmsys_vicuna-33b-v1.3
-│   │   ├── config.json
-│   │   ├── generation_config.json
-│   │   ├── pytorch_model-00001-of-00007.bin
-│   │   ├── pytorch_model-00002-of-00007.bin
-│   │   ├── pytorch_model-00003-of-00007.bin
-│   │   ├── pytorch_model-00004-of-00007.bin
-│   │   ├── pytorch_model-00005-of-00007.bin
-│   │   ├── pytorch_model-00006-of-00007.bin
-│   │   ├── pytorch_model-00007-of-00007.bin
-│   │   ├── pytorch_model.bin.index.json
-│   │   ├── special_tokens_map.json
-│   │   ├── tokenizer_config.json
-│   │   └── tokenizer.model
+└── user_data
+    └── models
+        └── lmsys_vicuna-33b-v1.3
+            ├── config.json
+            ├── generation_config.json
+            ├── pytorch_model-00001-of-00007.bin
+            ├── pytorch_model-00002-of-00007.bin
+            ├── pytorch_model-00003-of-00007.bin
+            ├── pytorch_model-00004-of-00007.bin
+            ├── pytorch_model-00005-of-00007.bin
+            ├── pytorch_model-00006-of-00007.bin
+            ├── pytorch_model-00007-of-00007.bin
+            ├── pytorch_model.bin.index.json
+            ├── special_tokens_map.json
+            ├── tokenizer_config.json
+            └── tokenizer.model
 ```
 
 In both cases, you can use the "Model" tab of the UI to download the model from Hugging Face automatically.
It is also possible to download it via the command-line with: diff --git a/cmd_wsl.bat b/cmd_wsl.bat deleted file mode 100755 index f9f4348a..00000000 --- a/cmd_wsl.bat +++ /dev/null @@ -1,11 +0,0 @@ -@echo off - -cd /D "%~dp0" - -set PATH=%PATH%;%SystemRoot%\system32 - -@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script -call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh cmd" - -:end -pause diff --git a/css/main.css b/css/main.css index a3fa9753..d6e5ac83 100644 --- a/css/main.css +++ b/css/main.css @@ -625,19 +625,19 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { width: 100%; overflow-y: visible; } - + .message { break-inside: avoid; } - + .gradio-container { overflow: visible; } - + .tab-nav { display: none !important; } - + #chat-tab > :first-child { max-width: unset; } @@ -1291,3 +1291,94 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .dark .footer-button:hover svg { stroke: rgb(209 213 219); } + +.tgw-accordion { + padding: 10px 12px !important; +} + +.dark .tgw-accordion { + border: 1px solid var(--border-color-dark); +} + +.welcome-greeting { + text-align: center; + margin-top: 40vh; + font-size: 24px; + opacity: 0.7; + padding-left: 1rem; + padding-right: 1rem; +} + +/* Thinking blocks styling */ +.thinking-block { + margin-bottom: 12px; + border-radius: 8px; + border: 1px solid rgb(0 0 0 / 10%); + background-color: var(--light-theme-gray); + overflow: hidden; +} + +.dark .thinking-block { + background-color: var(--darker-gray); +} + +.thinking-header { + display: flex; + align-items: center; + padding: 10px 16px; + cursor: pointer; + user-select: none; + font-size: 14px; + color: rgb(0 0 0 / 70%); + transition: background-color 0.2s; +} + +.thinking-header:hover { + background-color: rgb(0 0 0 / 3%); +} + +.thinking-header::-webkit-details-marker { + display: none; +} + +.thinking-icon { + margin-right: 8px; + color: rgb(0 0 0 / 50%); +} + +.thinking-title { + font-weight: 500; +} + +.thinking-content { + padding: 12px 16px; + border-top: 1px solid rgb(0 0 0 / 7%); + color: rgb(0 0 0 / 70%); + font-size: 14px; + line-height: 1.5; + overflow-wrap: break-word; + max-height: 250px; + overflow-y: scroll; + contain: layout; +} + +/* Animation for opening thinking blocks */ +@keyframes fadeIn { + from { opacity: 0; } + to { opacity: 1; } +} + +.thinking-block[open] .thinking-content { + animation: fadeIn 0.3s ease-out; +} + +/* Additional style for in-progress thinking */ +.thinking-block[data-streaming="true"] .thinking-title { + animation: pulse 1.5s infinite; +} + +@keyframes pulse { + 0% { opacity: 0.6; } + 50% { opacity: 1; } + 100% { opacity: 0.6; } +} diff --git a/docs/10 - WSL.md b/docs/10 - WSL.md deleted file mode 100644 index e0d66393..00000000 --- a/docs/10 - WSL.md +++ /dev/null @@ -1,146 +0,0 @@ -## WSL instructions - -If you do not have WSL installed, follow the [instructions below](https://github.com/oobabooga/text-generation-webui/wiki/10-%E2%80%90-WSL#wsl-installation) first. - -### Additional WSL setup info - -If you want to install Linux to a drive other than C, open powershell and enter these commands: - -``` -cd D:\Path\To\Linux -$ProgressPreference = 'SilentlyContinue' -Invoke-WebRequest -Uri -OutFile Linux.appx -UseBasicParsing -mv Linux.appx Linux.zip -``` - -Then open Linux.zip and you should see several .appx files inside. - -The one with _x64.appx contains the exe installer that you need. - -Extract the contents of that _x64.appx file and run .exe to install. 
- -Linux Distro URLs: https://learn.microsoft.com/en-us/windows/wsl/install-manual#downloading-distributions - -**ENSURE THAT THE WSL LINUX DISTRO THAT YOU WISH TO USE IS SET AS THE DEFAULT!** - -Do this by using these commands: - -``` -wsl -l -wsl -s -``` - -### Web UI Installation - -Run the "start" script. By default it will install the web UI in WSL: -/home/{username}/text-gen-install - -To launch the web UI in the future after it is already installed, run -the same "start" script. Ensure that one_click.py and wsl.sh are next to it! - -### Updating the web UI - -As an alternative to running the "update" script, you can also run "wsl.sh update" in WSL. - -### Running an interactive shell - -As an alternative to running the "cmd" script, you can also run "wsl.sh cmd" in WSL. - -### Changing the default install location - -To change this, you will need to edit the scripts as follows: -wsl.sh: line ~22 INSTALL_DIR="/path/to/install/dir" - -Keep in mind that there is a long-standing bug in WSL that significantly -slows drive read/write speeds when using a physical drive as opposed to -the virtual one that Linux is installed in. - -## WSL installation - -Guide created by [@jfryton](https://github.com/jfryton). Thank you jfryton. - ------ - -Here's an easy-to-follow, step-by-step guide for installing Windows Subsystem for Linux (WSL) with Ubuntu on Windows 10/11: - -### Step 1: Enable WSL - -1. Press the Windows key + X and click on "Windows PowerShell (Admin)" or "Windows Terminal (Admin)" to open PowerShell or Terminal with administrator privileges. -2. In the PowerShell window, type the following command and press Enter: - -``` -wsl --install -``` - -If this command doesn't work, you can enable WSL with the following command for Windows 10: - -``` -wsl --set-default-version 1 -``` - -For Windows 11, you can use: - -``` -wsl --set-default-version 2 -``` - -You may be prompted to restart your computer. If so, save your work and restart. - -### Step 2: Install Ubuntu - -1. Open the Microsoft Store. -2. Search for "Ubuntu" in the search bar. -3. Choose the desired Ubuntu version (e.g., Ubuntu 20.04 LTS) and click "Get" or "Install" to download and install the Ubuntu app. -4. Once the installation is complete, click "Launch" or search for "Ubuntu" in the Start menu and open the app. - -### Step 3: Set up Ubuntu - -1. When you first launch the Ubuntu app, it will take a few minutes to set up. Be patient as it installs the necessary files and sets up your environment. -2. Once the setup is complete, you will be prompted to create a new UNIX username and password. Choose a username and password, and make sure to remember them, as you will need them for future administrative tasks within the Ubuntu environment. - -### Step 4: Update and upgrade packages - -1. After setting up your username and password, it's a good idea to update and upgrade your Ubuntu system. Run the following commands in the Ubuntu terminal: - -``` -sudo apt update -sudo apt upgrade -``` - -2. Enter your password when prompted. This will update the package list and upgrade any outdated packages. - -Congratulations! You have now installed WSL with Ubuntu on your Windows 10/11 system. You can use the Ubuntu terminal for various tasks, like running Linux commands, installing packages, or managing files. - -You can launch your WSL Ubuntu installation by selecting the Ubuntu app (like any other program installed on your computer) or typing 'ubuntu' into Powershell or Terminal. - -### Step 5: Proceed with Linux instructions - -1. 
You can now follow the Linux setup instructions. If you receive any error messages about a missing tool or package, just install them using apt: - -``` -sudo apt install [missing package] -``` - -You will probably need to install build-essential - -``` -sudo apt install build-essential -``` - -If you face any issues or need to troubleshoot, you can always refer to the official Microsoft documentation for WSL: https://docs.microsoft.com/en-us/windows/wsl/ - -### WSL2 performance using /mnt: - -When you git clone a repository, put it inside WSL and not outside. To understand more, take a look at this [issue](https://github.com/microsoft/WSL/issues/4197#issuecomment-604592340) - -### Bonus: Port Forwarding - -By default, you won't be able to access the webui from another device on your local network. You will need to setup the appropriate port forwarding using the following steps: - -1. First, get the IP address of the WSL by typing `wsl hostname -I`. This will output the IP address, for example `172.20.134.111`. -2. Then, use the following command (using PowerShell or Terminal with administrator privileges) to set up port forwarding, replacing `172.20.134.111` with the IP address you obtained in step 1: - -``` -netsh interface portproxy add v4tov4 listenaddress=0.0.0.0 listenport=7860 connectaddress=172.20.134.111 connectport=7860 -``` - diff --git a/download-model.py b/download-model.py index 8ff1d69c..25517491 100644 --- a/download-model.py +++ b/download-model.py @@ -1,5 +1,5 @@ ''' -Downloads models from Hugging Face to models/username_modelname. +Downloads models from Hugging Face to user_data/models/username_modelname. Example: python download-model.py facebook/opt-1.3b @@ -175,7 +175,7 @@ class ModelDownloader: if model_dir: base_folder = model_dir else: - base_folder = 'models' if not is_lora else 'loras' + base_folder = 'user_data/models' if not is_lora else 'user_data/loras' # If the model is of type GGUF, save directly in the base_folder if is_llamacpp: @@ -356,7 +356,7 @@ if __name__ == '__main__': parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).') parser.add_argument('--exclude-pattern', type=str, default=None, help='Regex pattern to exclude files from download.') parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.') - parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).') + parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/user_data/models).') parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.') parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.') parser.add_argument('--max-retries', type=int, default=7, help='Max retries count when get error in download time.') diff --git a/extensions/Training_PRO/matplotgraph.py b/extensions/Training_PRO/matplotgraph.py index 348fc01a..b30bee83 100644 --- a/extensions/Training_PRO/matplotgraph.py +++ b/extensions/Training_PRO/matplotgraph.py @@ -59,4 +59,4 @@ def create_graph(lora_path, lora_name): print(f"File 'training_graph.json' does not exist in the {lora_path}") except ImportError: - print("matplotlib is not installed. 
Please install matplotlib to create PNG graphs") \ No newline at end of file + print("matplotlib is not installed. Please install matplotlib to create PNG graphs") diff --git a/extensions/Training_PRO/script.py b/extensions/Training_PRO/script.py index f553e482..cb11a8df 100644 --- a/extensions/Training_PRO/script.py +++ b/extensions/Training_PRO/script.py @@ -175,23 +175,23 @@ def ui(): with gr.Row(): with gr.Column(): with gr.Row(): - dataset = gr.Dropdown(choices=get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown']) - create_refresh_button(dataset, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'json')}, 'refresh-button') + dataset = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown']) + create_refresh_button(dataset, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'json')}, 'refresh-button') with gr.Row(): - eval_dataset = gr.Dropdown(choices=get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown']) - create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'json')}, 'refresh-button') + eval_dataset = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown']) + create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'json')}, 'refresh-button') with gr.Column(): with gr.Row(): - format = gr.Dropdown(choices=get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown']) - create_refresh_button(format, lambda: None, lambda: {'choices': get_datasets('training/formats', 'json')}, 'refresh-button') + format = gr.Dropdown(choices=get_datasets('user_data/training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown']) + create_refresh_button(format, lambda: None, lambda: {'choices': get_datasets('user_data/training/formats', 'json')}, 'refresh-button') with gr.Row(): eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') with gr.Tab(label="Text file"): with gr.Row(): - raw_text_file = gr.Dropdown(choices=get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The text file to use for training.', elem_classes=['slim-dropdown']) - create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'txt')}, 'refresh-button') + raw_text_file = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'txt'), value='None', label='Text file', info='The text file to use for training.', elem_classes=['slim-dropdown']) + create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'txt')}, 'refresh-button') with gr.Row(): with gr.Column(): @@ -208,7 +208,7 @@ def ui(): download_file_url = 
gr.Textbox(label='Download JSON or txt file to datasets (or formats) folder', value='',info='The URL of a file to download. If on github, make sure you get url of the raw file (https://raw.githubusercontent.com/...). If huggin face, make sure the url has /resolve/ in it not /blob/') with gr.Row(): download_check_overwrite = gr.Checkbox(label='Overwrite', value=False, info='Overwrite if file exist') - download_folder = gr.Radio(label="Destination", value='training/datasets', choices=['training/datasets', 'training/formats'], interactive=True) + download_folder = gr.Radio(label="Destination", value='user_data/training/datasets', choices=['user_data/training/datasets', 'user_data/training/formats'], interactive=True) download_button = gr.Button('Download') download_status = gr.Textbox(label='Download Status', value='', interactive=False) with gr.Row(): @@ -235,7 +235,7 @@ def ui(): with gr.Row(): with gr.Column(): models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True) - evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.') + evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('user_data/training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under user_data/training/datasets.') with gr.Row(): with gr.Column(): stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') @@ -310,7 +310,7 @@ def ui(): if raw_text_file not in ['None', '']: logger.info("Loading Text file...") - fullpath = clean_path('training/datasets', f'{raw_text_file}') + fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}') fullpath = Path(fullpath) if fullpath.is_dir(): logger.info('Training path directory {}'.format(raw_text_file)) @@ -324,10 +324,10 @@ def ui(): logger.info(f"Loaded training file: {file_path.name}") else: try: - with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: + with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: raw_text = file.read().replace('\r', '') except: - yield f"{raw_text_file}.txt doesn't seem to exsist anymore... check your training/datasets folder" + yield f"{raw_text_file}.txt doesn't seem to exsist anymore... check your user_data/training/datasets folder" return @@ -353,7 +353,7 @@ def ui(): yield "Select format choice for dataset." 
return - with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: + with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: format_data: dict[str, str] = json.load(formatFile) def generate_prompt(data_point: dict[str, str]): @@ -381,7 +381,7 @@ def ui(): return tokenize_dummy(prompt) logger.info("Loading JSON datasets...") - data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json')) + data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json')) data_keys = [] @@ -456,7 +456,7 @@ def ui(): #debug_slicer.change(lambda x: non_serialized_params.update({"debug_slicer": x}), debug_slicer, None) def update_dataset(): - return gr.update(choices=get_datasets('training/datasets', 'json')), gr.update(choices=get_datasets('training/datasets', 'txt')) + return gr.update(choices=get_datasets('user_data/training/datasets', 'json')), gr.update(choices=get_datasets('user_data/training/datasets', 'txt')) download_button.click(download_file_from_url, [download_file_url,download_check_overwrite,download_folder] , download_status).then(update_dataset,None,[dataset , raw_text_file]) @@ -670,7 +670,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch if raw_text_file not in ['None', '']: train_template["template_type"] = "raw_text" logger.info("Loading text file...") - fullpath = clean_path('training/datasets', f'{raw_text_file}') + fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}') fullpath = Path(fullpath) if fullpath.is_dir(): logger.info('Training path directory {}'.format(raw_text_file)) @@ -683,7 +683,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch logger.info(f"Loaded training file: {file_path.name}") else: - with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: + with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: raw_text = file.read().replace('\r', '') # FPHAM PRECISE SLICING @@ -720,7 +720,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch train_template["template_type"] = "dataset" - with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: + with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: format_data: dict[str, str] = json.load(formatFile) # == store training prompt == @@ -742,7 +742,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch return tokenize(prompt, add_eos_token, add_bos_token) logger.info("Loading JSON datasets...") - data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json')) + data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json')) train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30)) print(f"BOS: {add_bos_token} EOS: {add_eos_token}") @@ -751,7 +751,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch if eval_dataset == 'None': eval_data = None else: - eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json')) + eval_data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{eval_dataset}.json')) eval_data = 
eval_data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30)) # == We MUST reload model if it went through any previous training, even failed one == @@ -1157,11 +1157,11 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch decoded_entries.append({"value": decoded_text}) # Write the log file - Path('logs').mkdir(exist_ok=True) - with open(Path('logs/train_dataset_sample.json'), 'w') as json_file: + Path('user_data/logs').mkdir(exist_ok=True) + with open(Path('user_data/logs/train_dataset_sample.json'), 'w') as json_file: json.dump(decoded_entries, json_file, indent=4) - logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.") + logger.info("Log file 'train_dataset_sample.json' created in the 'user_data/logs' directory.") except Exception as e: logger.error(f"Failed to create log file due to error: {e}") diff --git a/extensions/Training_PRO/train_utils.py b/extensions/Training_PRO/train_utils.py index 18686144..79994880 100644 --- a/extensions/Training_PRO/train_utils.py +++ b/extensions/Training_PRO/train_utils.py @@ -194,13 +194,13 @@ def precise_cut(text: str, overlap: bool, min_chars_cut: int, eos_to_hc: bool, c if debug_slicer: # Write the log file - Path('logs').mkdir(exist_ok=True) + Path('user_data/logs').mkdir(exist_ok=True) sentencelist_dict = {index: sentence for index, sentence in enumerate(sentencelist)} - output_file = "logs/sentencelist.json" + output_file = "user_data/logs/sentencelist.json" with open(output_file, 'w') as f: json.dump(sentencelist_dict, f,indent=2) - print("Saved sentencelist.json in logs folder") + print("Saved sentencelist.json in user_data/logs folder") return sentencelist @@ -281,13 +281,13 @@ def sliding_block_cut(text: str, min_chars_cut: int, eos_to_hc: bool, cutoff_len if debug_slicer: # Write the log file - Path('logs').mkdir(exist_ok=True) + Path('user_data/logs').mkdir(exist_ok=True) sentencelist_dict = {index: sentence for index, sentence in enumerate(sentencelist)} - output_file = "logs/sentencelist.json" + output_file = "user_data/logs/sentencelist.json" with open(output_file, 'w') as f: json.dump(sentencelist_dict, f,indent=2) - print("Saved sentencelist.json in logs folder") + print("Saved sentencelist.json in user_data/logs folder") return sentencelist diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py index 76be4a58..8b242fb6 100644 --- a/extensions/gallery/script.py +++ b/extensions/gallery/script.py @@ -72,13 +72,13 @@ def generate_html(): global cards cards = [] # Iterate through files in image folder - for file in sorted(Path("characters").glob("*")): + for file in sorted(Path("user_data/characters").glob("*")): if file.suffix in [".json", ".yml", ".yaml"]: character = file.stem container_html = '
' image_html = "
" - for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: + for path in [Path(f"user_data/characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: if path.exists(): image_html = f'' break diff --git a/extensions/openai/script.py b/extensions/openai/script.py index f907cdbb..c2dc337b 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -86,6 +86,20 @@ app.add_middleware( ) +@app.middleware("http") +async def validate_host_header(request: Request, call_next): + # Be strict about only approving access to localhost by default + if not (shared.args.listen or shared.args.public_api): + host = request.headers.get("host", "").split(":")[0] + if host not in ["localhost", "127.0.0.1"]: + return JSONResponse( + status_code=400, + content={"detail": "Invalid host header"} + ) + + return await call_next(request) + + @app.options("/", dependencies=check_key) async def options_route(): return JSONResponse(content="OK") @@ -236,6 +250,11 @@ async def handle_moderations(request: Request): return JSONResponse(response) +@app.get("/v1/internal/health", dependencies=check_key) +async def handle_health_check(): + return JSONResponse(content={"status": "ok"}) + + @app.post("/v1/internal/encode", response_model=EncodeResponse, dependencies=check_key) async def handle_token_encode(request_data: EncodeRequest): response = token_encode(request_data.text) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index ea688897..4d6018f9 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, Field class GenerationOptions(BaseModel): - preset: str | None = Field(default=None, description="The name of a file under text-generation-webui/presets (without the .yaml extension). The sampling parameters that get overwritten by this option are the keys in the default_preset() function in modules/presets.py.") + preset: str | None = Field(default=None, description="The name of a file under text-generation-webui/user_data/presets (without the .yaml extension). The sampling parameters that get overwritten by this option are the keys in the default_preset() function in modules/presets.py.") dynatemp_low: float = 1 dynatemp_high: float = 1 dynatemp_exponent: float = 1 @@ -103,10 +103,10 @@ class ChatCompletionRequestParams(BaseModel): mode: str = Field(default='instruct', description="Valid options: instruct, chat, chat-instruct.") - instruction_template: str | None = Field(default=None, description="An instruction template defined under text-generation-webui/instruction-templates. If not set, the correct template will be automatically obtained from the model metadata.") + instruction_template: str | None = Field(default=None, description="An instruction template defined under text-generation-webui/user_data/instruction-templates. If not set, the correct template will be automatically obtained from the model metadata.") instruction_template_str: str | None = Field(default=None, description="A Jinja2 instruction template. If set, will take precedence over everything else.") - character: str | None = Field(default=None, description="A character defined under text-generation-webui/characters. If not set, the default \"Assistant\" character will be used.") + character: str | None = Field(default=None, description="A character defined under text-generation-webui/user_data/characters. 
If not set, the default \"Assistant\" character will be used.") bot_name: str | None = Field(default=None, description="Overwrites the value set by character field.", alias="name2") context: str | None = Field(default=None, description="Overwrites the value set by character field.") greeting: str | None = Field(default=None, description="Overwrites the value set by character field.") diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index c9e450e4..6e93dd92 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -148,7 +148,7 @@ class ChromaCollector(): id_ = new_ids[i] metadata = metadatas[i] if metadatas is not None else None embedding = self.embeddings_cache.get(text) - if embedding is not None and embedding.any(): + if embedding is not None and any(embedding): existing_texts.append(text) existing_embeddings.append(embedding) existing_ids.append(id_) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index f308edb9..e808c473 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -31,24 +31,94 @@ function removeLastClick() { } function handleMorphdomUpdate(text) { + // Track closed blocks + const closedBlocks = new Set(); + document.querySelectorAll(".thinking-block").forEach(block => { + const blockId = block.getAttribute("data-block-id"); + // If block exists and is not open, add to closed set + if (blockId && !block.hasAttribute("open")) { + closedBlocks.add(blockId); + } + }); + + // Store scroll positions for any open blocks + const scrollPositions = {}; + document.querySelectorAll(".thinking-block[open]").forEach(block => { + const content = block.querySelector(".thinking-content"); + const blockId = block.getAttribute("data-block-id"); + if (content && blockId) { + const isAtBottom = Math.abs((content.scrollHeight - content.scrollTop) - content.clientHeight) < 5; + scrollPositions[blockId] = { + position: content.scrollTop, + isAtBottom: isAtBottom + }; + } + }); + morphdom( document.getElementById("chat").parentNode, "
" + text + "
", { onBeforeElUpdated: function(fromEl, toEl) { + // Preserve code highlighting if (fromEl.tagName === "PRE" && fromEl.querySelector("code[data-highlighted]")) { const fromCode = fromEl.querySelector("code"); const toCode = toEl.querySelector("code"); if (fromCode && toCode && fromCode.textContent === toCode.textContent) { - // If the content is the same, preserve the entire
 element
             toEl.className = fromEl.className;
             toEl.innerHTML = fromEl.innerHTML;
-            return false; // Skip updating the <pre> element
+            return false;
+          }
+        }
+
+        // For thinking blocks, respect closed state
+        if (fromEl.classList && fromEl.classList.contains("thinking-block") &&
+            toEl.classList && toEl.classList.contains("thinking-block")) {
+          const blockId = toEl.getAttribute("data-block-id");
+          // If this block was closed by user, keep it closed
+          if (blockId && closedBlocks.has(blockId)) {
+            toEl.removeAttribute("open");
+          }
+        }
+
+        return !fromEl.isEqualNode(toEl);
+      },
+
+      onElUpdated: function(el) {
+        // Restore scroll positions for open thinking blocks
+        if (el.classList && el.classList.contains("thinking-block") && el.hasAttribute("open")) {
+          const blockId = el.getAttribute("data-block-id");
+          const content = el.querySelector(".thinking-content");
+
+          if (content && blockId && scrollPositions[blockId]) {
+            setTimeout(() => {
+              if (scrollPositions[blockId].isAtBottom) {
+                content.scrollTop = content.scrollHeight;
+              } else {
+                content.scrollTop = scrollPositions[blockId].position;
+              }
+            }, 0);
           }
         }
-        return !fromEl.isEqualNode(toEl); // Update only if nodes differ
       }
     }
   );
+
+  // Add toggle listeners for new blocks
+  document.querySelectorAll(".thinking-block").forEach(block => {
+    if (!block._hasToggleListener) {
+      block.addEventListener("toggle", function(e) {
+        if (this.open) {
+          const content = this.querySelector(".thinking-content");
+          if (content) {
+            setTimeout(() => {
+              content.scrollTop = content.scrollHeight;
+            }, 0);
+          }
+        }
+      });
+      block._hasToggleListener = true;
+    }
+  });
 }
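The hooks above, together with the `.thinking-block` rules added to `css/main.css`, assume each reasoning section is rendered as a `<details>` element with a stable `data-block-id`. The Python sketch below illustrates markup that would satisfy those selectors; the helper name, title text, and attribute layout are assumptions for illustration, not code from this diff:

```python
import html

def render_thinking_block(block_id: str, thinking_text: str, streaming: bool = False) -> str:
    """Illustrative sketch of markup that matches the thinking-block CSS/JS above."""
    # data-block-id lets handleMorphdomUpdate() remember which blocks the user closed;
    # the open attribute and the "toggle" event come for free from <details>/<summary>.
    streaming_attr = ' data-streaming="true"' if streaming else ""
    return (
        f'<details class="thinking-block" data-block-id="{html.escape(block_id)}" open{streaming_attr}>'
        '<summary class="thinking-header">'
        '<span class="thinking-icon"></span>'  # icon content omitted in this sketch
        '<span class="thinking-title">Thinking...</span>'
        '</summary>'
        f'<div class="thinking-content">{html.escape(thinking_text)}</div>'
        '</details>'
    )
```

Only the class names, `data-block-id`, and the `open`/`data-streaming` attributes matter to the collapse- and scroll-preservation logic above.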
diff --git a/js/main.js b/js/main.js
index c5c47d04..33b7d6bd 100644
--- a/js/main.js
+++ b/js/main.js
@@ -395,7 +395,7 @@ let bigPictureVisible = false;
 function addBigPicture() {
   var imgElement = document.createElement("img");
   var timestamp = new Date().getTime();
-  imgElement.src = "/file/cache/pfp_character.png?time=" + timestamp;
+  imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
   imgElement.classList.add("bigProfilePicture");
   imgElement.addEventListener("load", function () {
     this.style.visibility = "visible";
diff --git a/js/update_big_picture.js b/js/update_big_picture.js
index 4c094776..ec51d63b 100644
--- a/js/update_big_picture.js
+++ b/js/update_big_picture.js
@@ -2,6 +2,6 @@ function updateBigPicture() {
   var existingElement = document.querySelector(".bigProfilePicture");
   if (existingElement) {
     var timestamp = new Date().getTime();
-    existingElement.src = "/file/cache/pfp_character.png?time=" + timestamp;
+    existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
   }
 }
diff --git a/modules/chat.py b/modules/chat.py
index fd949907..e117e6ee 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -417,16 +417,8 @@ def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_
             yield history
             return
 
-    show_after = html.escape(state.get("show_after")) if state.get("show_after") else None
     for history in chatbot_wrapper(text, state, regenerate=regenerate, _continue=_continue, loading_message=loading_message, for_ui=for_ui):
-        if show_after:
-            after = history["visible"][-1][1].partition(show_after)[2] or "*Is thinking...*"
-            yield {
-                'internal': history['internal'],
-                'visible': history['visible'][:-1] + [[history['visible'][-1][0], after]]
-            }
-        else:
-            yield history
+        yield history
 
 
 def character_is_loaded(state, raise_exception=False):
@@ -533,9 +525,9 @@ def start_new_chat(state):
 
 def get_history_file_path(unique_id, character, mode):
     if mode == 'instruct':
-        p = Path(f'logs/instruct/{unique_id}.json')
+        p = Path(f'user_data/logs/instruct/{unique_id}.json')
     else:
-        p = Path(f'logs/chat/{character}/{unique_id}.json')
+        p = Path(f'user_data/logs/chat/{character}/{unique_id}.json')
 
     return p
 
@@ -571,13 +563,13 @@ def rename_history(old_id, new_id, character, mode):
 
 def get_paths(state):
     if state['mode'] == 'instruct':
-        return Path('logs/instruct').glob('*.json')
+        return Path('user_data/logs/instruct').glob('*.json')
     else:
         character = state['character_menu']
 
         # Handle obsolete filenames and paths
-        old_p = Path(f'logs/{character}_persistent.json')
-        new_p = Path(f'logs/persistent_{character}.json')
+        old_p = Path(f'user_data/logs/{character}_persistent.json')
+        new_p = Path(f'user_data/logs/persistent_{character}.json')
         if old_p.exists():
             logger.warning(f"Renaming \"{old_p}\" to \"{new_p}\"")
             old_p.rename(new_p)
@@ -589,7 +581,7 @@ def get_paths(state):
             p.parent.mkdir(exist_ok=True)
             new_p.rename(p)
 
-        return Path(f'logs/chat/{character}').glob('*.json')
+        return Path(f'user_data/logs/chat/{character}').glob('*.json')
 
 
 def find_all_histories(state):
@@ -740,7 +732,7 @@ def generate_pfp_cache(character):
     if not cache_folder.exists():
         cache_folder.mkdir()
 
-    for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]:
+    for path in [Path(f"user_data/characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]:
         if path.exists():
             original_img = Image.open(path)
             original_img.save(Path(f'{cache_folder}/pfp_character.png'), format='PNG')
@@ -760,12 +752,12 @@ def load_character(character, name1, name2):
 
     filepath = None
     for extension in ["yml", "yaml", "json"]:
-        filepath = Path(f'characters/{character}.{extension}')
+        filepath = Path(f'user_data/characters/{character}.{extension}')
         if filepath.exists():
             break
 
     if filepath is None or not filepath.exists():
-        logger.error(f"Could not find the character \"{character}\" inside characters/. No character has been loaded.")
+        logger.error(f"Could not find the character \"{character}\" inside user_data/characters. No character has been loaded.")
         raise ValueError
 
     file_contents = open(filepath, 'r', encoding='utf-8').read()
@@ -804,7 +796,7 @@ def load_instruction_template(template):
     if template == 'None':
         return ''
 
-    for filepath in [Path(f'instruction-templates/{template}.yaml'), Path('instruction-templates/Alpaca.yaml')]:
+    for filepath in [Path(f'user_data/instruction-templates/{template}.yaml'), Path('user_data/instruction-templates/Alpaca.yaml')]:
         if filepath.exists():
             break
     else:
@@ -846,17 +838,17 @@ def upload_character(file, img, tavern=False):
 
     outfile_name = name
     i = 1
-    while Path(f'characters/{outfile_name}.yaml').exists():
+    while Path(f'user_data/characters/{outfile_name}.yaml').exists():
         outfile_name = f'{name}_{i:03d}'
         i += 1
 
-    with open(Path(f'characters/{outfile_name}.yaml'), 'w', encoding='utf-8') as f:
+    with open(Path(f'user_data/characters/{outfile_name}.yaml'), 'w', encoding='utf-8') as f:
         f.write(yaml_data)
 
     if img is not None:
-        img.save(Path(f'characters/{outfile_name}.png'))
+        img.save(Path(f'user_data/characters/{outfile_name}.png'))
 
-    logger.info(f'New character saved to "characters/{outfile_name}.yaml".')
+    logger.info(f'New character saved to "user_data/characters/{outfile_name}.yaml".')
     return gr.update(value=outfile_name, choices=get_available_characters())
 
 
@@ -931,9 +923,9 @@ def save_character(name, greeting, context, picture, filename):
         return
 
     data = generate_character_yaml(name, greeting, context)
-    filepath = Path(f'characters/{filename}.yaml')
+    filepath = Path(f'user_data/characters/{filename}.yaml')
     save_file(filepath, data)
-    path_to_img = Path(f'characters/{filename}.png')
+    path_to_img = Path(f'user_data/characters/{filename}.png')
     if picture is not None:
         picture.save(path_to_img)
         logger.info(f'Saved {path_to_img}.')
@@ -941,9 +933,9 @@ def save_character(name, greeting, context, picture, filename):
 
 def delete_character(name, instruct=False):
     for extension in ["yml", "yaml", "json"]:
-        delete_file(Path(f'characters/{name}.{extension}'))
+        delete_file(Path(f'user_data/characters/{name}.{extension}'))
 
-    delete_file(Path(f'characters/{name}.png'))
+    delete_file(Path(f'user_data/characters/{name}.png'))
 
 
 def jinja_template_from_old_format(params, verbose=False):
@@ -1246,7 +1238,7 @@ def handle_save_template_click(instruction_template_str):
     contents = generate_instruction_template_yaml(instruction_template_str)
     return [
         "My Template.yaml",
-        "instruction-templates/",
+        "user_data/instruction-templates/",
         contents,
         gr.update(visible=True)
     ]
@@ -1255,7 +1247,7 @@ def handle_save_template_click(instruction_template_str):
 def handle_delete_template_click(template):
     return [
         f"{template}.yaml",
-        "instruction-templates/",
+        "user_data/instruction-templates/",
         gr.update(visible=False)
     ]
 
diff --git a/modules/evaluate.py b/modules/evaluate.py
index ba0de378..4f41c1fc 100644
--- a/modules/evaluate.py
+++ b/modules/evaluate.py
@@ -12,8 +12,8 @@ from modules.text_generation import encode
 
 
 def load_past_evaluations():
-    if Path('logs/evaluations.csv').exists():
-        df = pd.read_csv(Path('logs/evaluations.csv'), dtype=str)
+    if Path('user_data/logs/evaluations.csv').exists():
+        df = pd.read_csv(Path('user_data/logs/evaluations.csv'), dtype=str)
         df['Perplexity'] = pd.to_numeric(df['Perplexity'])
         return df
     else:
@@ -26,7 +26,7 @@ past_evaluations = load_past_evaluations()
 def save_past_evaluations(df):
     global past_evaluations
     past_evaluations = df
-    filepath = Path('logs/evaluations.csv')
+    filepath = Path('user_data/logs/evaluations.csv')
     filepath.parent.mkdir(parents=True, exist_ok=True)
     df.to_csv(filepath, index=False)
 
@@ -69,7 +69,7 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
         data = load_dataset('ptb_text_only', 'penn_treebank', split='test')
         text = " ".join(data['sentence'])
     else:
-        with open(Path(f'training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f:
+        with open(Path(f'user_data/training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f:
             text = f.read()
 
     for model in models:
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index 0289bb21..6bb422ea 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -40,7 +40,7 @@ class Exllamav2Model:
         config.model_dir = str(path_to_model)
         config.prepare()
 
-        config.max_seq_len = shared.args.max_seq_len
+        config.max_seq_len = shared.args.ctx_size
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
         config.no_flash_attn = shared.args.no_flash_attn
@@ -85,7 +85,44 @@ class Exllamav2Model:
             model.load_autosplit(cache)
 
         tokenizer = ExLlamaV2Tokenizer(config)
-        generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
+
+        # Initialize draft model for speculative decoding
+        draft_model = None
+        draft_cache = None
+
+        if shared.args.model_draft and shared.args.model_draft.lower() not in ["none", ""]:
+            logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}")
+
+            # Find the draft model path
+            draft_path = Path(shared.args.model_draft)
+            if not draft_path.exists():
+                draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft)
+
+            draft_config = ExLlamaV2Config()
+            draft_config.model_dir = str(draft_path)
+            draft_config.prepare()
+            draft_config.arch_compat_overrides()
+
+            # Set context size for draft model
+            if shared.args.ctx_size_draft > 0:
+                draft_config.max_seq_len = shared.args.ctx_size_draft
+            else:
+                draft_config.max_seq_len = config.max_seq_len
+
+            draft_model = ExLlamaV2(draft_config)
+            draft_cache = cache_type(draft_model, lazy=True)
+            draft_model.load_autosplit(draft_cache)
+
+            logger.info(f"Draft model loaded successfully with max_draft={shared.args.draft_max}")
+
+        generator = ExLlamaV2StreamingGenerator(
+            model,
+            cache,
+            tokenizer,
+            draft_model=draft_model,
+            draft_cache=draft_cache,
+            num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0
+        )
 
         result = self()
         result.model = model
@@ -93,6 +130,8 @@ class Exllamav2Model:
         result.tokenizer = tokenizer
         result.generator = generator
         result.loras = None
+        result.draft_model = draft_model
+        result.draft_cache = draft_cache
         return result, result
 
     def encode(self, string, **kwargs):
@@ -179,6 +218,10 @@ class Exllamav2Model:
         else:
             max_new_tokens = state['max_new_tokens']
 
+        # Reset speculative decoding stats if using a draft model
+        if hasattr(self, 'draft_model') and self.draft_model is not None:
+            self.generator.reset_sd_stats()
+
         self.generator.begin_stream(ids, settings, loras=self.loras)
 
         decoded_text = ''
@@ -190,6 +233,11 @@ class Exllamav2Model:
             decoded_text += chunk
             yield decoded_text
 
+        # Log speculative decoding stats if using draft model
+        if hasattr(self, 'draft_model') and self.draft_model is not None:
+            efficiency, accuracy, total_tokens, total_draft_tokens, accepted_draft_tokens = self.generator.get_sd_stats()
+            logger.info(f"Speculative decoding: accepted={accepted_draft_tokens}/{total_draft_tokens} tokens")
+
     def generate(self, prompt, state):
         output = ''
         for output in self.generate_with_streaming(prompt, state):
diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index b159d9ce..eb801940 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -192,7 +192,7 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
         config.model_dir = str(pretrained_model_name_or_path)
         config.prepare()
 
-        config.max_seq_len = shared.args.max_seq_len
+        config.max_seq_len = shared.args.ctx_size
         config.scale_pos_emb = shared.args.compress_pos_emb
         config.scale_alpha_value = shared.args.alpha_value
         config.no_flash_attn = shared.args.no_flash_attn
diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index 2d9c493a..f15fc0b2 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Union
 
 import torch
 from exllamav3 import Cache, Config, Model
+from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
 from torch.nn import CrossEntropyLoss
 from transformers import (
     GenerationConfig,
@@ -33,13 +34,39 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
         self.ex_model = Model.from_config(config)
 
         # Calculate the closest multiple of 256 at or above the chosen value
-        max_tokens = shared.args.max_seq_len
+        max_tokens = shared.args.ctx_size
         if max_tokens % 256 != 0:
             adjusted_tokens = ((max_tokens // 256) + 1) * 256
             logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}")
             max_tokens = adjusted_tokens
 
-        self.ex_cache = Cache(self.ex_model, max_num_tokens=max_tokens)
+        # Parse cache type
+        cache_type = shared.args.cache_type.lower()
+        cache_kwargs = {}
+        if cache_type == 'fp16':
+            layer_type = CacheLayer_fp16
+        elif cache_type.startswith('q'):
+            layer_type = CacheLayer_quant
+            if '_' in cache_type:
+                # Different bits for k and v (e.g., q4_q8)
+                k_part, v_part = cache_type.split('_')
+                k_bits = int(k_part[1:])
+                v_bits = int(v_part[1:])
+            else:
+                # Same bits for k and v (e.g., q4)
+                k_bits = v_bits = int(cache_type[1:])
+
+            # Validate bit ranges
+            if not (2 <= k_bits <= 8 and 2 <= v_bits <= 8):
+                logger.warning(f"Invalid quantization bits: k_bits={k_bits}, v_bits={v_bits}. Must be between 2 and 8. Falling back to fp16.")
+                layer_type = CacheLayer_fp16
+            else:
+                cache_kwargs = {'k_bits': k_bits, 'v_bits': v_bits}
+        else:
+            logger.warning(f"Unrecognized cache type: {cache_type}. Falling back to fp16.")
+            layer_type = CacheLayer_fp16
+
+        self.ex_cache = Cache(self.ex_model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs)
 
         # Create load parameters dictionary
         load_params = {'progressbar': True}
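
For reference, the cache_type strings handled above ("fp16", "q4", "q4_q8", and so on) map to key/value bit widths roughly as in this standalone sketch. The helper name and return shape are illustrative only and are not part of the patch, which performs the same logic inline:

```python
# Illustrative sketch of the cache_type parsing used by the ExLlamaV3 loader above.
# parse_cache_type is a made-up helper name; the patch does this inline.
def parse_cache_type(cache_type: str):
    cache_type = cache_type.lower()
    if cache_type == 'fp16':
        return 'fp16', None, None
    if cache_type.startswith('q'):
        if '_' in cache_type:
            k_part, v_part = cache_type.split('_')   # e.g. "q4_q8" -> "q4", "q8"
            k_bits, v_bits = int(k_part[1:]), int(v_part[1:])
        else:
            k_bits = v_bits = int(cache_type[1:])    # e.g. "q4" -> 4, 4
        if 2 <= k_bits <= 8 and 2 <= v_bits <= 8:
            return 'quant', k_bits, v_bits
    # Unrecognized or out-of-range values fall back to fp16, as in the patch
    return 'fp16', None, None


assert parse_cache_type("q4_q8") == ('quant', 4, 8)
assert parse_cache_type("q6") == ('quant', 6, 6)
assert parse_cache_type("q9") == ('fp16', None, None)
```
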
diff --git a/modules/gradio_hijack.py b/modules/gradio_hijack.py
index 2ddd983a..817da40c 100644
--- a/modules/gradio_hijack.py
+++ b/modules/gradio_hijack.py
@@ -1,5 +1,6 @@
 '''
-Copied from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14184
+Most of the code here was adapted from:
+https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14184
 '''
 
 import inspect
@@ -7,6 +8,30 @@ import warnings
 from functools import wraps
 
 import gradio as gr
+import gradio.routes
+import gradio.utils
+from starlette.middleware.trustedhost import TrustedHostMiddleware
+
+from modules import shared
+
+orig_create_app = gradio.routes.App.create_app
+
+
+# Be strict about only approving access to localhost by default
+def create_app_with_trustedhost(*args, **kwargs):
+    app = orig_create_app(*args, **kwargs)
+
+    if not (shared.args.listen or shared.args.share):
+        app.add_middleware(
+            TrustedHostMiddleware,
+            allowed_hosts=["localhost", "127.0.0.1"]
+        )
+
+    return app
+
+
+gradio.routes.App.create_app = create_app_with_trustedhost
+gradio.utils.launch_counter = lambda: None
 
 
 class GradioDeprecationWarning(DeprecationWarning):
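
The TrustedHostMiddleware wired up above comes from Starlette and rejects requests whose Host header is not listed in allowed_hosts with a 400 "Invalid host header" response before they reach the app. A minimal, self-contained sketch of that behavior, using a bare FastAPI app purely for illustration (not part of the patch):

```python
# Minimal illustration of Starlette's TrustedHostMiddleware, as used above.
# Requests with a Host header other than localhost/127.0.0.1 receive a 400
# before reaching the application.
from fastapi import FastAPI
from starlette.middleware.trustedhost import TrustedHostMiddleware

app = FastAPI()
app.add_middleware(
    TrustedHostMiddleware,
    allowed_hosts=["localhost", "127.0.0.1"]
)


@app.get("/")
def index():
    return {"ok": True}

# Example (module name hypothetical): uvicorn app_module:app
# curl -H "Host: example.com" http://127.0.0.1:8000/  ->  400 Invalid host header
```
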
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 144f2593..c5252c26 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -1,3 +1,4 @@
+import datetime
 import functools
 import html
 import os
@@ -106,8 +107,87 @@ def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
+def extract_thinking_block(string):
+    """Extract thinking blocks from the beginning of a string."""
+    if not string:
+        return None, string
+
+    THINK_START_TAG = "<think>"
+    THINK_END_TAG = "</think>"
+
+    # Look for opening tag
+    start_pos = string.lstrip().find(THINK_START_TAG)
+    if start_pos == -1:
+        return None, string
+
+    # Adjust start position to account for any leading whitespace
+    start_pos = string.find(THINK_START_TAG)
+
+    # Find the content after the opening tag
+    content_start = start_pos + len(THINK_START_TAG)
+
+    # Look for closing tag
+    end_pos = string.find(THINK_END_TAG, content_start)
+
+    if end_pos != -1:
+        # Both tags found - extract content between them
+        thinking_content = string[content_start:end_pos]
+        remaining_content = string[end_pos + len(THINK_END_TAG):]
+        return thinking_content, remaining_content
+    else:
+        # Only opening tag found - everything else is thinking content
+        thinking_content = string[content_start:]
+        return thinking_content, ""
+
+
 @functools.lru_cache(maxsize=None)
-def convert_to_markdown(string):
+def convert_to_markdown(string, message_id=None):
+    if not string:
+        return ""
+
+    # Use a default message ID if none provided
+    if message_id is None:
+        message_id = "unknown"
+
+    # Extract thinking block if present
+    thinking_content, remaining_content = extract_thinking_block(string)
+
+    # Process the main content
+    html_output = process_markdown_content(remaining_content)
+
+    # If thinking content was found, process it using the same function
+    if thinking_content is not None:
+        thinking_html = process_markdown_content(thinking_content)
+
+        # Generate unique ID for the thinking block
+        block_id = f"thinking-{message_id}-0"
+
+        # Check if thinking is complete or still in progress
+        is_streaming = not remaining_content
+        title_text = "Thinking..." if is_streaming else "Thought"
+
+        thinking_block = f'''
+        
+ + + + + + + {title_text} + +
{thinking_html}
+
+ ''' + + # Prepend the thinking block to the message HTML + html_output = thinking_block + html_output + + return html_output + + +def process_markdown_content(string): + """Process a string through the markdown conversion pipeline.""" if not string: return "" @@ -208,15 +288,15 @@ def convert_to_markdown(string): return html_output -def convert_to_markdown_wrapped(string, use_cache=True): +def convert_to_markdown_wrapped(string, message_id=None, use_cache=True): ''' Used to avoid caching convert_to_markdown calls during streaming. ''' if use_cache: - return convert_to_markdown(string) + return convert_to_markdown(string, message_id=message_id) - return convert_to_markdown.__wrapped__(string) + return convert_to_markdown.__wrapped__(string, message_id=message_id) def generate_basic_html(string): @@ -272,7 +352,7 @@ def generate_instruct_html(history): for i in range(len(history['visible'])): row_visible = history['visible'][i] row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] if converted_visible[0]: # Don't display empty user messages output += ( @@ -307,19 +387,19 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= # We use ?character and ?time.time() to force the browser to reset caches img_bot = ( - f'' - if Path("cache/pfp_character_thumb.png").exists() else '' + f'' + if Path("user_data/cache/pfp_character_thumb.png").exists() else '' ) img_me = ( - f'' - if Path("cache/pfp_me.png").exists() else '' + f'' + if Path("user_data/cache/pfp_me.png").exists() else '' ) for i in range(len(history['visible'])): row_visible = history['visible'][i] row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] if converted_visible[0]: # Don't display empty user messages output += ( @@ -359,7 +439,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): for i in range(len(history['visible'])): row_visible = history['visible'][i] row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] if converted_visible[0]: # Don't display empty user messages output += ( @@ -389,8 +469,21 @@ def generate_chat_html(history, name1, name2, reset_cache=False): return output +def time_greeting(): + current_hour = datetime.datetime.now().hour + if 5 <= current_hour < 12: + return "Good morning!" + elif 12 <= current_hour < 18: + return "Good afternoon!" + else: + return "Good evening!" + + def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False): - if mode == 'instruct': + if len(history['visible']) == 0: + greeting = f"
{time_greeting()} How can I help you today?
" + result = f'
{greeting}
' + elif mode == 'instruct': result = generate_instruct_html(history) elif style == 'wpp': result = generate_chat_html(history, name1, name2) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index c88f945d..9572d5aa 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -1,11 +1,13 @@ import json import os import pprint +import re import socket import subprocess import sys import threading import time +from pathlib import Path import llama_cpp_binaries import requests @@ -251,7 +253,7 @@ class LlamaServer: cmd = [ self.server_path, "--model", self.model_path, - "--ctx-size", str(shared.args.n_ctx), + "--ctx-size", str(shared.args.ctx_size), "--n-gpu-layers", str(shared.args.n_gpu_layers), "--batch-size", str(shared.args.batch_size), "--port", str(self.port), @@ -281,6 +283,41 @@ class LlamaServer: cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] if shared.args.rope_freq_base > 0: cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)] + if shared.args.model_draft not in [None, 'None']: + path = Path(shared.args.model_draft) + if not path.exists(): + path = Path(f'{shared.args.model_dir}/{shared.args.model_draft}') + + if path.is_file(): + model_file = path + else: + model_file = sorted(Path(f'{shared.args.model_dir}/{shared.args.model_draft}').glob('*.gguf'))[0] + + cmd += ["--model-draft", model_file] + if shared.args.draft_max > 0: + cmd += ["--draft-max", str(shared.args.draft_max)] + if shared.args.gpu_layers_draft > 0: + cmd += ["--gpu-layers-draft", str(shared.args.gpu_layers_draft)] + if shared.args.device_draft: + cmd += ["--device-draft", shared.args.device_draft] + if shared.args.ctx_size_draft > 0: + cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)] + if shared.args.streaming_llm: + cmd += ["--cache-reuse", "1"] + if shared.args.extra_flags: + # Clean up the input + extra_flags = shared.args.extra_flags.strip() + if extra_flags.startswith('"') and extra_flags.endswith('"'): + extra_flags = extra_flags[1:-1].strip() + elif extra_flags.startswith("'") and extra_flags.endswith("'"): + extra_flags = extra_flags[1:-1].strip() + + for flag_item in extra_flags.split(','): + if '=' in flag_item: + flag, value = flag_item.split('=', 1) + cmd += [f"--{flag}", value] + else: + cmd.append(f"--{flag_item}") env = os.environ.copy() if os.name == 'posix': @@ -299,17 +336,7 @@ class LlamaServer: env=env ) - def filter_stderr(process_stderr): - try: - for line in iter(process_stderr.readline, ''): - if not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line: - sys.stderr.write(line) - sys.stderr.flush() - except (ValueError, IOError): - # Handle pipe closed exceptions - pass - - threading.Thread(target=filter_stderr, args=(self.process.stderr,), daemon=True).start() + threading.Thread(target=filter_stderr_with_progress, args=(self.process.stderr,), daemon=True).start() # Wait for server to be healthy health_url = f"http://127.0.0.1:{self.port}/health" @@ -360,3 +387,18 @@ class LlamaServer: self.process.kill() self.process = None + + +def filter_stderr_with_progress(process_stderr): + progress_pattern = re.compile(r'slot update_slots: id.*progress = (\d+\.\d+)') + try: + for line in iter(process_stderr.readline, ''): + progress_match = progress_pattern.search(line) + if progress_match: + sys.stderr.write(line) + sys.stderr.flush() + elif not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line: + sys.stderr.write(line) + sys.stderr.flush() + 
except (ValueError, IOError): + pass diff --git a/modules/loaders.py b/modules/loaders.py index 7d6afe80..b8ae82d7 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -9,9 +9,11 @@ loaders_and_params = OrderedDict({ 'threads', 'threads_batch', 'batch_size', - 'n_ctx', + 'ctx_size', 'cache_type', 'tensor_split', + 'extra_flags', + 'streaming_llm', 'rope_freq_base', 'compress_pos_emb', 'flash_attn', @@ -20,6 +22,12 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'mlock', 'numa', + 'model_draft', + 'draft_max', + 'gpu_layers_draft', + 'device_draft', + 'ctx_size_draft', + 'speculative_decoding_accordion', ], 'Transformers': [ 'gpu_split', @@ -41,14 +49,15 @@ loaders_and_params = OrderedDict({ 'no_use_fast', ], 'ExLlamav3_HF': [ - 'max_seq_len', + 'ctx_size', + 'cache_type', 'gpu_split', 'cfg_cache', 'trust_remote_code', 'no_use_fast', ], 'ExLlamav2_HF': [ - 'max_seq_len', + 'ctx_size', 'cache_type', 'gpu_split', 'alpha_value', @@ -64,7 +73,7 @@ loaders_and_params = OrderedDict({ 'no_use_fast', ], 'ExLlamav2': [ - 'max_seq_len', + 'ctx_size', 'cache_type', 'gpu_split', 'alpha_value', @@ -76,6 +85,10 @@ loaders_and_params = OrderedDict({ 'no_xformers', 'no_sdpa', 'exllamav2_info', + 'model_draft', + 'draft_max', + 'ctx_size_draft', + 'speculative_decoding_accordion', ], 'HQQ': [ 'hqq_backend', @@ -83,7 +96,7 @@ loaders_and_params = OrderedDict({ 'no_use_fast', ], 'TensorRT-LLM': [ - 'max_seq_len', + 'ctx_size', 'cpp_runner', 'tensorrt_llm_info', ] diff --git a/modules/models.py b/modules/models.py index 99b068aa..d0b0402a 100644 --- a/modules/models.py +++ b/modules/models.py @@ -52,10 +52,8 @@ def load_model(model_name, loader=None): tokenizer = load_tokenizer(model_name) shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) - if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'): - shared.settings['truncation_length'] = shared.args.max_seq_len - elif loader == 'llama.cpp': - shared.settings['truncation_length'] = shared.args.n_ctx + if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp': + shared.settings['truncation_length'] = shared.args.ctx_size logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") logger.info(f"LOADER: \"{loader}\"") diff --git a/modules/models_settings.py b/modules/models_settings.py index ee2ed71b..ae589bb3 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -11,8 +11,7 @@ def get_fallback_settings(): return { 'bf16': False, 'use_eager_attention': False, - 'max_seq_len': 2048, - 'n_ctx': 2048, + 'ctx_size': 2048, 'rope_freq_base': 0, 'compress_pos_emb': 1, 'alpha_value': 1, @@ -26,7 +25,7 @@ def get_fallback_settings(): def get_model_metadata(model): model_settings = {} - # Get settings from models/config.yaml and models/config-user.yaml + # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml settings = shared.model_config for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): @@ -59,7 +58,7 @@ def get_model_metadata(model): for k in metadata: if k.endswith('context_length'): - model_settings['n_ctx'] = min(metadata[k], 8192) + model_settings['ctx_size'] = min(metadata[k], 8192) model_settings['truncation_length_info'] = metadata[k] elif k.endswith('rope.freq_base'): model_settings['rope_freq_base'] = metadata[k] @@ -97,7 +96,7 @@ def get_model_metadata(model): if k in metadata: model_settings['truncation_length'] = metadata[k] 
model_settings['truncation_length_info'] = metadata[k] - model_settings['max_seq_len'] = min(metadata[k], 8192) + model_settings['ctx_size'] = min(metadata[k], 8192) if 'rope_theta' in metadata: model_settings['rope_freq_base'] = metadata['rope_theta'] @@ -145,7 +144,7 @@ def get_model_metadata(model): if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000: model_settings.pop('rope_freq_base') - # Apply user settings from models/config-user.yaml + # Apply user settings from user_data/models/config-user.yaml settings = shared.user_config for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): @@ -224,7 +223,7 @@ def apply_model_settings_to_state(model, state): def save_model_settings(model, state): ''' - Save the settings for this model to models/config-user.yaml + Save the settings for this model to user_data/models/config-user.yaml ''' if model == 'None': yield ("Not saving the settings because no model is selected in the menu.") diff --git a/modules/one_click_installer_check.py b/modules/one_click_installer_check.py deleted file mode 100644 index 4bde8600..00000000 --- a/modules/one_click_installer_check.py +++ /dev/null @@ -1,9 +0,0 @@ -from pathlib import Path - -from modules.logging_colors import logger - -if Path('../webui.py').exists(): - logger.warning('\nIt looks like you are running an outdated version of ' - 'the one-click-installers.\n' - 'Please migrate your installation following the instructions here:\n' - 'https://github.com/oobabooga/text-generation-webui/wiki/Migrating-an-old-one%E2%80%90click-install') diff --git a/modules/presets.py b/modules/presets.py index 7cab2af0..a432bf52 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -58,7 +58,7 @@ def presets_params(): def load_preset(name, verbose=False): generate_params = default_preset() if name not in ['None', None, '']: - path = Path(f'presets/{name}.yaml') + path = Path(f'user_data/presets/{name}.yaml') if path.exists(): with open(path, 'r') as infile: preset = yaml.safe_load(infile) diff --git a/modules/prompts.py b/modules/prompts.py index 565c2450..8f00cac2 100644 --- a/modules/prompts.py +++ b/modules/prompts.py @@ -7,7 +7,7 @@ def load_prompt(fname): if fname in ['None', '']: return '' else: - file_path = Path(f'prompts/{fname}.txt') + file_path = Path(f'user_data/prompts/{fname}.txt') if not file_path.exists(): return '' diff --git a/modules/shared.py b/modules/shared.py index 08268ae0..5d9dd362 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -1,6 +1,7 @@ import argparse import copy import os +import shlex import sys from collections import OrderedDict from pathlib import Path @@ -31,7 +32,7 @@ need_restart = False settings = { 'show_controls': True, 'start_with': '', - 'mode': 'chat-instruct', + 'mode': 'instruct', 'chat_style': 'cai-chat', 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', 'prompt-default': 'QA', @@ -57,7 +58,6 @@ settings = { 'seed': -1, 'custom_stopping_strings': '', 'custom_token_bans': '', - 'show_after': '', 'negative_prompt': '', 'autoload_model': False, 'dark_theme': True, @@ -77,10 +77,10 @@ group.add_argument('--multi-user', action='store_true', help='Multi-user mode. 
C group.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.') group.add_argument('--model', type=str, help='Name of the model to load by default.') group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.') -group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.') -group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.') +group.add_argument('--model-dir', type=str, default='user_data/models', help='Path to directory with all the models.') +group.add_argument('--lora-dir', type=str, default='user_data/loras', help='Path to directory with all the loras.') group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.') -group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') +group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See user_data/settings-template.yaml for an example. If you create a file called user_data/settings.yaml, this file will be loaded by default without the need to use the --settings flag.') group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') group.add_argument('--idle-timeout', type=int, default=0, help='Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.') @@ -94,7 +94,7 @@ group = parser.add_argument_group('Transformers/Accelerate') group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.') group.add_argument('--cpu-memory', type=float, default=0, help='Maximum CPU memory in GiB. Use this for CPU offloading.') group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') -group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".') +group.add_argument('--disk-cache-dir', type=str, default='user_data/cache', help='Directory to save the disk cache to. Defaults to "user_data/cache".') group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).') group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. 
This reduces VRAM usage slightly, but it comes at a performance cost.') @@ -115,10 +115,9 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for # llama.cpp group = parser.add_argument_group('llama.cpp') group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.') -group.add_argument('--n_ctx', type=int, default=8192, help='Size of the prompt context.') group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') -group.add_argument('--batch-size', type=int, default=2048, help='Maximum number of prompt tokens to batch together when calling llama_eval.') +group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') @@ -126,17 +125,31 @@ group.add_argument('--tensor-split', type=str, default=None, help='Split the mod group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') +group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"') +group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') + +# Cache +group = parser.add_argument_group('Context and cache management') +group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') +group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') + +# Speculative decoding +group = parser.add_argument_group('Speculative decoding') +group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.') +group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.') +group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.') +group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') +group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.') # ExLlamaV2 group = parser.add_argument_group('ExLlamaV2') group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. 
Example: 20,7,7.') group.add_argument('--autosplit', action='store_true', help='Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.') -group.add_argument('--max_seq_len', type=int, default=8192, help='Maximum sequence length.') group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.') group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.') -group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') +group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') # HQQ @@ -192,12 +205,36 @@ group.add_argument('--nowebui', action='store_true', help='Do not launch the Gra # Deprecated parameters group = parser.add_argument_group('Deprecated') +# Handle CMD_FLAGS.txt +cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt" +if cmd_flags_path.exists(): + with cmd_flags_path.open('r', encoding='utf-8') as f: + cmd_flags = ' '.join( + line.strip().rstrip('\\').strip() + for line in f + if line.strip().rstrip('\\').strip() and not line.strip().startswith('#') + ) + + if cmd_flags: + # Command-line takes precedence over CMD_FLAGS.txt + sys.argv = [sys.argv[0]] + shlex.split(cmd_flags) + sys.argv[1:] + + args = parser.parse_args() args_defaults = parser.parse_args([]) + +# Create a mapping of all argument aliases to their canonical names +alias_to_dest = {} +for action in parser._actions: + for opt in action.option_strings: + alias_to_dest[opt.lstrip('-').replace('-', '_')] = action.dest + provided_arguments = [] for arg in sys.argv[1:]: arg = arg.lstrip('-').replace('-', '_') - if hasattr(args, arg): + if arg in alias_to_dest: + provided_arguments.append(alias_to_dest[arg]) + elif hasattr(args, arg): provided_arguments.append(arg) diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py index c2685b75..73178c39 100644 --- a/modules/tensorrt_llm.py +++ b/modules/tensorrt_llm.py @@ -1,15 +1,15 @@ from pathlib import Path -import tensorrt_llm import torch -from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp +import tensorrt_llm from modules import shared from modules.logging_colors import logger from modules.text_generation import ( get_max_prompt_length, get_reply_from_output_ids ) +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp class TensorRTLLMModel: @@ -35,7 +35,7 @@ class TensorRTLLMModel: logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"") runner_kwargs.update( max_batch_size=1, - max_input_len=shared.args.max_seq_len - 512, + max_input_len=shared.args.ctx_size - 512, max_output_len=512, max_beam_width=1, max_attention_window_size=None, diff --git a/modules/text_generation.py b/modules/text_generation.py index 40046eb2..4e3d1d7a 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -264,6 +264,11 @@ def apply_stopping_strings(reply, all_stop_strings): def get_reply_from_output_ids(output_ids, state=None, starting_from=0): + import torch + + 
if torch.cuda.is_available(): + torch.cuda.synchronize() + reply = decode(output_ids[starting_from:], state['skip_special_tokens'] if state else True) # Handle tokenizers that do not add the leading space for the first token diff --git a/modules/training.py b/modules/training.py index c6c380a3..2354c39d 100644 --- a/modules/training.py +++ b/modules/training.py @@ -52,7 +52,7 @@ def create_ui(): with gr.Column(): always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background']) - with gr.Accordion(label='Target Modules', open=False): + with gr.Accordion(label='Target Modules', open=False, elem_classes='tgw-accordion'): gr.Markdown("Selects which modules to target in training. Targeting more modules is closer to a full fine-tune at the cost of increased VRAM requirements and adapter size.\nNOTE: Only works for model_id='llama', other types will retain default training behavior and not use these settings.") with gr.Row(): with gr.Column(): @@ -86,7 +86,7 @@ def create_ui(): with gr.Row(): lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. "Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.', elem_classes=['slim-dropdown']) - with gr.Accordion(label='Advanced Options', open=False): + with gr.Accordion(label='Advanced Options', open=False, elem_classes='tgw-accordion'): with gr.Row(): with gr.Column(): lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. 
Most users should leave at default.') @@ -106,23 +106,23 @@ def create_ui(): with gr.Column(): with gr.Tab(label='Formatted Dataset'): with gr.Row(): - format = gr.Dropdown(choices=utils.get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'], interactive=not mu) - ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('training/formats', 'json')}, 'refresh-button', interactive=not mu) + format = gr.Dropdown(choices=utils.get_datasets('user_data/training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'], interactive=not mu) + ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/formats', 'json')}, 'refresh-button', interactive=not mu) with gr.Row(): - dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu) - ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button', interactive=not mu) + dataset = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu) + ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'json')}, 'refresh-button', interactive=not mu) with gr.Row(): - eval_dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'], interactive=not mu) - ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button', interactive=not mu) + eval_dataset = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'], interactive=not mu) + ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'json')}, 'refresh-button', interactive=not mu) eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.') with gr.Tab(label="Raw text file"): with gr.Row(): - raw_text_file = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu) - ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'txt')}, 'refresh-button', interactive=not mu) + raw_text_file = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu) + ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'txt')}, 
'refresh-button', interactive=not mu) with gr.Row(): with gr.Column(): @@ -143,7 +143,7 @@ def create_ui(): with gr.Row(): with gr.Column(): models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True, interactive=not mu) - evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.', interactive=not mu) + evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('user_data/training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under user_data/training/datasets.', interactive=not mu) with gr.Row(): with gr.Column(): stride_length = gr.Slider(label='Stride', minimum=0, maximum=32768, value=512, step=256, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') @@ -402,7 +402,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: if raw_text_file not in ['None', '']: train_template["template_type"] = "raw_text" logger.info("Loading raw text file dataset") - fullpath = clean_path('training/datasets', f'{raw_text_file}') + fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}') fullpath = Path(fullpath) if fullpath.is_dir(): logger.info('Training path directory {}'.format(raw_text_file)) @@ -415,7 +415,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: logger.info(f"Loaded training file: {file_path.name}") else: - with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: + with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: raw_text = file.read().replace('\r', '') cut_string = hard_cut_string.replace('\\n', '\n') @@ -460,7 +460,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: train_template["template_type"] = "dataset" - with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: + with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile: format_data: dict[str, str] = json.load(formatFile) # == store training prompt == @@ -482,13 +482,13 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: return tokenize(prompt, add_eos_token) logger.info("Loading JSON datasets") - data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json')) + data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json')) train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30)) if eval_dataset == 'None': eval_data = None else: - eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json')) + eval_data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{eval_dataset}.json')) eval_data = eval_data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % 
            random.randrange(16**30))

    # == We MUST reload model if it went through any previous training, even failed one ==
@@ -676,11 +676,11 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
                decoded_entries.append({"value": decoded_text})

            # Write the log file
-            Path('logs').mkdir(exist_ok=True)
-            with open(Path('logs/train_dataset_sample.json'), 'w') as json_file:
+            Path('user_data/logs').mkdir(exist_ok=True)
+            with open(Path('user_data/logs/train_dataset_sample.json'), 'w') as json_file:
                json.dump(decoded_entries, json_file, indent=4)

-            logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.")
+            logger.info("Log file 'train_dataset_sample.json' created in the 'user_data/logs' directory.")
        except Exception as e:
            logger.error(f"Failed to create log file due to error: {e}")
diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index add3be66..905f5c47 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -249,7 +249,7 @@ def load_model_HF(model_name):
        )

    if shared.args.disk:
-        params['offload_folder'] = shared.args.disk_cache_dir
+        params['offload_folder'] = str(Path(shared.args.disk_cache_dir))

    if shared.args.compress_pos_emb > 1:
        params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
diff --git a/modules/ui.py b/modules/ui.py
index d5caaeaa..f137e62d 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -94,7 +94,7 @@ if not shared.args.old_colors:
        input_radius='0.375rem',
    )

-if Path("notification.mp3").exists():
+if Path("user_data/notification.mp3").exists():
    audio_notification_js = "document.querySelector('#audio_notification audio')?.play();"
 else:
    audio_notification_js = ""
@@ -110,10 +110,10 @@ def list_model_elements():
        'threads_batch',
        'batch_size',
        'hqq_backend',
-        'n_ctx',
-        'max_seq_len',
+        'ctx_size',
        'cache_type',
        'tensor_split',
+        'extra_flags',
        'gpu_split',
        'alpha_value',
        'rope_freq_base',
@@ -145,6 +145,11 @@ def list_model_elements():
        'cpp_runner',
        'trust_remote_code',
        'no_use_fast',
+        'model_draft',
+        'draft_max',
+        'gpu_layers_draft',
+        'device_draft',
+        'ctx_size_draft',
    ]

    return elements
@@ -201,7 +206,6 @@ def list_interface_input_elements():
        'sampler_priority',
        'custom_stopping_strings',
        'custom_token_bans',
-        'show_after',
        'negative_prompt',
        'dry_sequence_breakers',
        'grammar_string',
@@ -262,7 +266,7 @@ def apply_interface_values(state, use_persistent=False):
    if 'textbox-default' in state and 'prompt_menu-default' in state:
        state.pop('prompt_menu-default')

-    if 'textbox-notebook' and 'prompt_menu-notebook' in state:
+    if 'textbox-notebook' in state and 'prompt_menu-notebook' in state:
        state.pop('prompt_menu-notebook')

    elements = list_interface_input_elements()
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index a830abfb..0d588549 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -88,7 +88,7 @@ def create_ui():
            shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])

        with gr.Row():
-            shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
+            shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')

        with gr.Row():
            shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
@@ -146,7 +146,7 @@ def create_chat_settings_ui():
            with gr.Column(scale=1):
                shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil', interactive=not mu)
-                shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None, interactive=not mu)
+                shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('user_data/cache/pfp_me.png')) if Path('user_data/cache/pfp_me.png').exists() else None, interactive=not mu)

    with gr.Tab('Instruction template'):
        with gr.Row():
diff --git a/modules/ui_default.py b/modules/ui_default.py
index ccae9a5e..c2946b37 100644
--- a/modules/ui_default.py
+++ b/modules/ui_default.py
@@ -102,7 +102,7 @@ def handle_save_prompt(text):
    return [
        text,
        utils.current_time() + ".txt",
-        "prompts/",
+        "user_data/prompts/",
        gr.update(visible=True)
    ]

@@ -110,6 +110,6 @@ def handle_save_prompt(text):
 def handle_delete_prompt(prompt):
    return [
        prompt + ".txt",
-        "prompts/",
+        "user_data/prompts/",
        gr.update(visible=True)
    ]
diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py
index 3a27e1b9..d1f9379b 100644
--- a/modules/ui_file_saving.py
+++ b/modules/ui_file_saving.py
@@ -28,7 +28,7 @@ def create_ui():
    # Character saver/deleter
    with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']:
-        shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.')
+        shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your user_data/characters folder with this base filename.')
        with gr.Row():
            shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button")
            shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button", variant='primary', interactive=not mu)
@@ -41,7 +41,7 @@ def create_ui():
    # Preset saver
    with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['preset_saver']:
-        shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your presets/ folder with this base filename.')
+        shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your user_data/presets folder with this base filename.')
        shared.gradio['save_preset_contents'] = gr.Textbox(lines=10, label='File contents')
        with gr.Row():
            shared.gradio['save_preset_cancel'] = gr.Button('Cancel', elem_classes="small-button")
@@ -72,7 +72,7 @@ def create_event_handlers():

 def handle_save_preset_confirm_click(filename, contents):
    try:
-        utils.save_file(f"presets/{filename}.yaml", contents)
+        utils.save_file(f"user_data/presets/{filename}.yaml", contents)
        available_presets = utils.get_available_presets()
        output = gr.update(choices=available_presets, value=filename)
    except Exception:
@@ -145,7 +145,7 @@ def handle_save_preset_click(state):

 def handle_delete_preset_click(preset):
    return [
        f"{preset}.yaml",
-        "presets/",
+        "user_data/presets/",
        gr.update(visible=True)
    ]

@@ -154,7 +154,7 @@ def handle_save_grammar_click(grammar_string):
    return [
        grammar_string,
        "My Fancy Grammar.gbnf",
-        "grammars/",
+        "user_data/grammars/",
        gr.update(visible=True)
    ]

@@ -162,6 +162,6 @@ def handle_save_grammar_click(grammar_string):
 def handle_delete_grammar_click(grammar_file):
    return [
        grammar_file,
-        "grammars/",
+        "user_data/grammars/",
        gr.update(visible=True)
    ]
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index b4af771c..e3cf2ba6 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -51,11 +51,11 @@ def create_ui():
            shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
            shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
            shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
-            shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.')
-            shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.')
-            shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
+            shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
+            shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
            shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
+            shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
            shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
            shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
            shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
@@ -70,6 +70,7 @@ def create_ui():
            shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
            shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
            shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
+            shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
            shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
            shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
            shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
@@ -90,7 +91,18 @@ def create_ui():
            shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
            shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
            shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
-            shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
+            shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
+
+            # Speculative decoding
+            with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
+                with gr.Row():
+                    shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', interactive=not mu)
+                    ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
+
+                shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')
+                shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
+                shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
+                shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')

        with gr.Column():
            with gr.Row():
@@ -211,9 +223,9 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
            model_dir=shared.args.model_dir if shared.args.model_dir != shared.args_defaults.model_dir else None
        )

-        if output_folder == Path("models"):
+        if output_folder == Path("user_data/models"):
            output_folder = Path(shared.args.model_dir)
-        elif output_folder == Path("loras"):
+        elif output_folder == Path("user_data/loras"):
            output_folder = Path(shared.args.lora_dir)

        if check:
@@ -234,10 +246,8 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur

 def update_truncation_length(current_length, state):
    if 'loader' in state:
-        if state['loader'].lower().startswith('exllama'):
-            return state['max_seq_len']
-        elif state['loader'] == 'llama.cpp':
-            return state['n_ctx']
+        if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp':
+            return state['ctx_size']

    return current_length
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index c3245a9d..6c2715af 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -93,7 +93,6 @@ def create_ui(default_preset):
            shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.', elem_classes=['add_scrollbar'])
            shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=2, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"')
            shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Token bans', info='Token IDs to ban, separated by commas. The IDs can be found in the Default or Notebook tab.')
-            shared.gradio['show_after'] = gr.Textbox(value=shared.settings['show_after'] or None, label='Show after', info='Hide the reply before this text.', placeholder="")
            shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', info='For CFG. Only used when guidance_scale is different than 1.', lines=3, elem_classes=['add_scrollbar'])
            shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=generate_params['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.')
            with gr.Row() as shared.gradio['grammar_file_row']:
@@ -122,16 +121,14 @@ def create_event_handlers():

 def get_truncation_length():
-    if 'max_seq_len' in shared.provided_arguments or shared.args.max_seq_len != shared.args_defaults.max_seq_len:
-        return shared.args.max_seq_len
-    elif 'n_ctx' in shared.provided_arguments or shared.args.n_ctx != shared.args_defaults.n_ctx:
-        return shared.args.n_ctx
+    if 'ctx_size' in shared.provided_arguments or shared.args.ctx_size != shared.args_defaults.ctx_size:
+        return shared.args.ctx_size
    else:
        return shared.settings['truncation_length']


 def load_grammar(name):
-    p = Path(f'grammars/{name}')
+    p = Path(f'user_data/grammars/{name}')
    if p.exists():
        return open(p, 'r', encoding='utf-8').read()
    else:
diff --git a/modules/ui_session.py b/modules/ui_session.py
index 66386d12..7cf9f6e6 100644
--- a/modules/ui_session.py
+++ b/modules/ui_session.py
@@ -13,7 +13,7 @@ def create_ui():
            shared.gradio['reset_interface'] = gr.Button("Apply flags/extensions and restart", interactive=not mu)
            with gr.Row():
                shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡')
-                shared.gradio['save_settings'] = gr.Button('Save UI defaults to settings.yaml', interactive=not mu)
+                shared.gradio['save_settings'] = gr.Button('Save UI defaults to user_data/settings.yaml', interactive=not mu)

        with gr.Row():
            with gr.Column():
@@ -48,7 +48,7 @@ def handle_save_settings(state, preset, extensions, show_controls, theme):
    return [
        contents,
        "settings.yaml",
-        "./",
+        "user_data/",
        gr.update(visible=True)
    ]
diff --git a/modules/utils.py b/modules/utils.py
index f6be7541..77324139 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -76,44 +76,54 @@ def get_available_models():
    # Get all GGUF files
    gguf_files = get_available_ggufs()

+    # Filter out non-first parts of multipart GGUF files
+    filtered_gguf_files = []
+    for gguf_path in gguf_files:
+        filename = os.path.basename(gguf_path)
+
+        match = re.search(r'-(\d+)-of-\d+\.gguf$', filename)
+
+        if match:
+            part_number = match.group(1)
+            # Keep only if it's part 1
+            if part_number.lstrip("0") == "1":
+                filtered_gguf_files.append(gguf_path)
+        else:
+            # Not a multi-part file
+            filtered_gguf_files.append(gguf_path)
+
    model_dir = Path(shared.args.model_dir)

    # Find top-level directories containing GGUF files
    dirs_with_gguf = set()
    for gguf_path in gguf_files:
        path = Path(gguf_path)
-        if path.parts:  # If in a subdirectory
-            dirs_with_gguf.add(path.parts[0])  # Add top-level directory
+        if len(path.parts) > 0:
+            dirs_with_gguf.add(path.parts[0])

-    # Find directories with safetensors files directly under them
+    # Find directories with safetensors files
    dirs_with_safetensors = set()
    for item in os.listdir(model_dir):
        item_path = model_dir / item
        if item_path.is_dir():
-            # Check if there are safetensors files directly under this directory
            if any(file.lower().endswith(('.safetensors', '.pt')) for file in os.listdir(item_path) if (item_path / file).is_file()):
                dirs_with_safetensors.add(item)

    # Find valid model directories
    model_dirs = []
-
    for item in os.listdir(model_dir):
        item_path = model_dir / item
-
-        # Skip if not a directory
        if not item_path.is_dir():
            continue

-        # Include directory if it either:
-        # 1. Doesn't contain GGUF files, OR
-        # 2. Contains both GGUF and safetensors files
+        # Include directory if it either doesn't contain GGUF files
+        # or contains both GGUF and safetensors files
        if item not in dirs_with_gguf or item in dirs_with_safetensors:
            model_dirs.append(item)

    model_dirs = sorted(model_dirs, key=natural_keys)

-    # Combine all models
-    return ['None'] + gguf_files + model_dirs
+    return ['None'] + filtered_gguf_files + model_dirs


 def get_available_ggufs():
@@ -131,11 +141,11 @@ def get_available_ggufs():


 def get_available_presets():
-    return sorted(set((k.stem for k in Path('presets').glob('*.yaml'))), key=natural_keys)
+    return sorted(set((k.stem for k in Path('user_data/presets').glob('*.yaml'))), key=natural_keys)


 def get_available_prompts():
-    prompt_files = list(Path('prompts').glob('*.txt'))
+    prompt_files = list(Path('user_data/prompts').glob('*.txt'))
    sorted_files = sorted(prompt_files, key=lambda x: x.stat().st_mtime, reverse=True)
    prompts = [file.stem for file in sorted_files]
    prompts.append('None')
@@ -143,12 +153,12 @@ def get_available_prompts():


 def get_available_characters():
-    paths = (x for x in Path('characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
+    paths = (x for x in Path('user_data/characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
    return sorted(set((k.stem for k in paths)), key=natural_keys)


 def get_available_instruction_templates():
-    path = "instruction-templates"
+    path = "user_data/instruction-templates"
    paths = []
    if os.path.exists(path):
        paths = (x for x in Path(path).iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
@@ -179,4 +189,4 @@ def get_available_chat_styles():


 def get_available_grammars():
-    return ['None'] + sorted([item.name for item in list(Path('grammars').glob('*.gbnf'))], key=natural_keys)
+    return ['None'] + sorted([item.name for item in list(Path('user_data/grammars').glob('*.gbnf'))], key=natural_keys)
diff --git a/one_click.py b/one_click.py
index 04b729eb..065afd99 100644
--- a/one_click.py
+++ b/one_click.py
@@ -28,14 +28,7 @@ conda_env_path = os.path.join(script_dir, "installer_files", "env")
 state_file = '.installer_state.json'

 # Command-line flags
-cmd_flags_path = os.path.join(script_dir, "CMD_FLAGS.txt")
-if os.path.exists(cmd_flags_path):
-    with open(cmd_flags_path, 'r') as f:
-        CMD_FLAGS = ' '.join(line.strip().rstrip('\\').strip() for line in f if line.strip().rstrip('\\').strip() and not line.strip().startswith('#'))
-else:
-    CMD_FLAGS = ''
-
-flags = f"{' '.join([flag for flag in sys.argv[1:] if flag != '--update-wizard'])} {CMD_FLAGS}"
+flags = f"{' '.join([flag for flag in sys.argv[1:] if flag != '--update-wizard'])}"


 def signal_handler(sig, frame):
@@ -300,9 +293,10 @@ def install_webui():

    # Write a flag to CMD_FLAGS.txt for CPU mode
    if selected_gpu == "NONE":
+        cmd_flags_path = os.path.join(script_dir, "user_data", "CMD_FLAGS.txt")
        with open(cmd_flags_path, 'r+') as cmd_flags_file:
            if "--cpu" not in cmd_flags_file.read():
-                print_big_message("Adding the --cpu flag to CMD_FLAGS.txt.")
+                print_big_message("Adding the --cpu flag to user_data/CMD_FLAGS.txt.")
                cmd_flags_file.write("\n--cpu\n")

    # Handle CUDA version
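The `apply_interface_values` change in `modules/ui.py` above fixes an operator-precedence pitfall: in `if 'textbox-notebook' and 'prompt_menu-notebook' in state:`, only the second key is actually tested for membership, because the non-empty string literal on the left is always truthy. A minimal standalone illustration (not project code):

```python
state = {'prompt_menu-notebook': 'QA'}  # 'textbox-notebook' is deliberately missing

# Old condition: parsed as `'textbox-notebook' and ('prompt_menu-notebook' in state)`,
# so the string literal short-circuits to the membership test and the check passes anyway.
print(bool('textbox-notebook' and 'prompt_menu-notebook' in state))  # True

# Fixed condition: both keys are genuinely checked against the dictionary.
print('textbox-notebook' in state and 'prompt_menu-notebook' in state)  # False
```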
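The new `extra-flags` field above documents a compact comma-separated `flag` / `flag=value` syntax in which a value may itself contain `=` (as in `override-tensor=exps=CPU`). How the web UI forwards these strings to llama-server is not part of this hunk; the sketch below is only a hypothetical parser for the documented format, splitting on commas and only on the first `=` of each entry:

```python
def parse_extra_flags(extra_flags: str) -> list[str]:
    # Hypothetical helper: turn "flag1=value1,flag2,flag3=value3" into CLI-style arguments.
    args = []
    for entry in filter(None, (part.strip() for part in extra_flags.split(','))):
        flag, sep, value = entry.partition('=')  # split on the first '=' only
        args.append(f'--{flag}')
        if sep:
            args.append(value)  # the value may itself contain '=' characters
    return args


print(parse_extra_flags('override-tensor=exps=CPU,flag2'))
# ['--override-tensor', 'exps=CPU', '--flag2']
```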
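The `alpha_value` and `rope_freq_base` help texts above quote the relation `rope_freq_base = 10000 * alpha_value ^ (64 / 63)`; plugging in the recommended NTKv1 values gives a feel for the resulting frequency bases:

```python
def rope_freq_base_from_alpha(alpha_value: float) -> float:
    # Relation quoted in the rope_freq_base help text.
    return 10000 * alpha_value ** (64 / 63)


for alpha in (1.75, 2.5):  # recommended for 1.5x and 2x context, per the alpha_value help text
    print(f"alpha_value={alpha} -> rope_freq_base ~= {rope_freq_base_from_alpha(alpha):.0f}")
# alpha_value=1.75 -> rope_freq_base ~= 17656
# alpha_value=2.5 -> rope_freq_base ~= 25366
```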
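The filter added to `get_available_models()` above keeps only the first shard of a multi-part GGUF, so a split download shows up once in the model list while single-file GGUFs pass through untouched. A standalone sketch of the same rule, run against made-up filenames:

```python
import os
import re


def keep_first_parts(gguf_files):
    # Same rule as the new filter: drop "-NNNNN-of-MMMMM.gguf" shards unless NNNNN is part 1.
    kept = []
    for gguf_path in gguf_files:
        match = re.search(r'-(\d+)-of-\d+\.gguf$', os.path.basename(gguf_path))
        if match is None or match.group(1).lstrip("0") == "1":
            kept.append(gguf_path)
    return kept


files = [
    'Example-70B-Q4_K_M-00001-of-00003.gguf',
    'Example-70B-Q4_K_M-00002-of-00003.gguf',
    'Example-70B-Q4_K_M-00003-of-00003.gguf',
    'small-model.Q8_0.gguf',  # not multi-part, always kept
]
print(keep_first_parts(files))
# ['Example-70B-Q4_K_M-00001-of-00003.gguf', 'small-model.Q8_0.gguf']
```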
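The `one_click.py` hunk above drops the in-place parsing of `CMD_FLAGS.txt`, and the file itself moves to `user_data/CMD_FLAGS.txt` later in this diff; where the flags are picked up after this change is not shown in this hunk. For reference, a standalone sketch that mirrors the removed parsing (blank lines and `#` comments ignored, trailing backslashes stripped, remaining lines joined into one flag string), pointed at the new location:

```python
import os


def read_cmd_flags(path=os.path.join("user_data", "CMD_FLAGS.txt")):
    # Mirrors the parsing removed from one_click.py above: skip blank lines and
    # '#' comments, strip trailing backslashes, join the rest into one string.
    if not os.path.exists(path):
        return ''

    with open(path, 'r') as f:
        return ' '.join(
            line.strip().rstrip('\\').strip()
            for line in f
            if line.strip().rstrip('\\').strip() and not line.strip().startswith('#')
        )


# The shipped file only contains comments, so this returns '' until a flag line
# such as "--listen --api" is added.
print(read_cmd_flags())
```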
display @@ -538,7 +532,7 @@ if __name__ == "__main__": flags_list = re.split(' +(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)|=', flags) model_dir = [flags_list[(flags_list.index(flag) + 1)] for flag in flags_list if flag == '--model-dir'][0].strip('"\'') else: - model_dir = 'models' + model_dir = 'user_data/models' if len([item for item in glob.glob(f'{model_dir}/*') if not item.endswith(('.txt', '.yaml'))]) == 0: print_big_message("You haven't downloaded any model yet.\nOnce the web UI launches, head over to the \"Model\" tab and download one.") diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index b9afaa07..c20c161e 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,12 +30,12 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != 
"x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 96cb299d..437da5b5 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0f1a4fc2..b1c87990 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt 
b/requirements/full/requirements_apple_intel.txt index 8d1e5294..e62987b0 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a44ff3cb..f7a9f114 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 35855162..b8cd8390 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 0716455e..3b52d59b 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 98c43b88..a04e8979 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,12 +30,12 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index c3336fc7..5c717343 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 4855225f..b616193d 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt index f40daa8a..de4740c9 100644 --- a/requirements/portable/requirements_amd_noavx2.txt +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 1ede251e..6310327d 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 26b68bff..f69b58e7 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 456a0499..dafa6bbe 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 7cd2dd34..c02191eb 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index b47b8bbc..456188b4 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 15834f89..7e733967 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkan-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkan-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index afb9e90f..0329a598 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkanavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkanavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/server.py b/server.py index 41a5660d..169578a5 100644 --- a/server.py +++ b/server.py @@ -1,7 +1,6 @@ import os 
 import warnings

-import modules.one_click_installer_check
 from modules import shared
 from modules.block_requests import OpenMonkeyPatch, RequestBlocker
 from modules.logging_colors import logger
@@ -94,8 +93,8 @@ def create_interface():
        'filter_by_loader': shared.args.loader or 'All'
    })

-    if Path("cache/pfp_character.png").exists():
-        Path("cache/pfp_character.png").unlink()
+    if Path("user_data/cache/pfp_character.png").exists():
+        Path("user_data/cache/pfp_character.png").unlink()

    # css/js strings
    css = ui.css
@@ -112,8 +111,8 @@ def create_interface():
        shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements})

        # Audio notification
-        if Path("notification.mp3").exists():
-            shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="notification.mp3", elem_id="audio_notification", visible=False)
+        if Path("user_data/notification.mp3").exists():
+            shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="user_data/notification.mp3", elem_id="audio_notification", visible=False)

        # Floating menus for saving/deleting files
        ui_file_saving.create_ui()
@@ -179,7 +178,7 @@ def create_interface():
        ssl_keyfile=shared.args.ssl_keyfile,
        ssl_certfile=shared.args.ssl_certfile,
        root_path=shared.args.subpath,
-        allowed_paths=["cache", "css", "extensions", "js"]
+        allowed_paths=["css", "js", "extensions", "user_data/cache"]
    )


@@ -192,10 +191,10 @@ if __name__ == "__main__":
    settings_file = None
    if shared.args.settings is not None and Path(shared.args.settings).exists():
        settings_file = Path(shared.args.settings)
-    elif Path('settings.yaml').exists():
-        settings_file = Path('settings.yaml')
-    elif Path('settings.json').exists():
-        settings_file = Path('settings.json')
+    elif Path('user_data/settings.yaml').exists():
+        settings_file = Path('user_data/settings.yaml')
+    elif Path('user_data/settings.json').exists():
+        settings_file = Path('user_data/settings.json')

    if settings_file is not None:
        logger.info(f"Loading settings from \"{settings_file}\"")
diff --git a/start_wsl.bat b/start_wsl.bat
deleted file mode 100755
index d7bacead..00000000
--- a/start_wsl.bat
+++ /dev/null
@@ -1,11 +0,0 @@
-@echo off
-
-cd /D "%~dp0"
-
-set PATH=%PATH%;%SystemRoot%\system32
-
-@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script
-call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh %*"
-
-:end
-pause
diff --git a/update_wizard_wsl.bat b/update_wizard_wsl.bat
deleted file mode 100755
index 35f0a349..00000000
--- a/update_wizard_wsl.bat
+++ /dev/null
@@ -1,11 +0,0 @@
-@echo off
-
-cd /D "%~dp0"
-
-set PATH=%PATH%;%SystemRoot%\system32
-
-@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script calling wsl.sh with 'update' will run updater
-call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh update-wizard"
-
-:end
-pause
diff --git a/user_data/CMD_FLAGS.txt b/user_data/CMD_FLAGS.txt
new file mode 100644
index 00000000..b0f667b0
--- /dev/null
+++ b/user_data/CMD_FLAGS.txt
@@ -0,0 +1,3 @@
+# Add persistent flags here to use every time you launch the web UI.
+# Example: +# --listen --api diff --git a/characters/Assistant.yaml b/user_data/characters/Assistant.yaml similarity index 100% rename from characters/Assistant.yaml rename to user_data/characters/Assistant.yaml diff --git a/characters/Example.png b/user_data/characters/Example.png similarity index 100% rename from characters/Example.png rename to user_data/characters/Example.png diff --git a/characters/Example.yaml b/user_data/characters/Example.yaml similarity index 100% rename from characters/Example.yaml rename to user_data/characters/Example.yaml diff --git a/grammars/arithmetic.gbnf b/user_data/grammars/arithmetic.gbnf similarity index 100% rename from grammars/arithmetic.gbnf rename to user_data/grammars/arithmetic.gbnf diff --git a/grammars/c.gbnf b/user_data/grammars/c.gbnf similarity index 100% rename from grammars/c.gbnf rename to user_data/grammars/c.gbnf diff --git a/grammars/chess.gbnf b/user_data/grammars/chess.gbnf similarity index 100% rename from grammars/chess.gbnf rename to user_data/grammars/chess.gbnf diff --git a/grammars/json.gbnf b/user_data/grammars/json.gbnf similarity index 100% rename from grammars/json.gbnf rename to user_data/grammars/json.gbnf diff --git a/grammars/json_w_trailing_space.gbnf b/user_data/grammars/json_w_trailing_space.gbnf similarity index 100% rename from grammars/json_w_trailing_space.gbnf rename to user_data/grammars/json_w_trailing_space.gbnf diff --git a/grammars/list.gbnf b/user_data/grammars/list.gbnf similarity index 100% rename from grammars/list.gbnf rename to user_data/grammars/list.gbnf diff --git a/grammars/roleplay.gbnf b/user_data/grammars/roleplay.gbnf similarity index 100% rename from grammars/roleplay.gbnf rename to user_data/grammars/roleplay.gbnf diff --git a/grammars/simple_arithmetic.gbnf b/user_data/grammars/simple_arithmetic.gbnf similarity index 100% rename from grammars/simple_arithmetic.gbnf rename to user_data/grammars/simple_arithmetic.gbnf diff --git a/instruction-templates/Airoboros-v1.2.yaml b/user_data/instruction-templates/Airoboros-v1.2.yaml similarity index 100% rename from instruction-templates/Airoboros-v1.2.yaml rename to user_data/instruction-templates/Airoboros-v1.2.yaml diff --git a/instruction-templates/Alpaca.yaml b/user_data/instruction-templates/Alpaca.yaml similarity index 100% rename from instruction-templates/Alpaca.yaml rename to user_data/instruction-templates/Alpaca.yaml diff --git a/instruction-templates/Bactrian.yaml b/user_data/instruction-templates/Bactrian.yaml similarity index 100% rename from instruction-templates/Bactrian.yaml rename to user_data/instruction-templates/Bactrian.yaml diff --git a/instruction-templates/Baichuan Chat.yaml b/user_data/instruction-templates/Baichuan Chat.yaml similarity index 100% rename from instruction-templates/Baichuan Chat.yaml rename to user_data/instruction-templates/Baichuan Chat.yaml diff --git a/instruction-templates/Baize.yaml b/user_data/instruction-templates/Baize.yaml similarity index 100% rename from instruction-templates/Baize.yaml rename to user_data/instruction-templates/Baize.yaml diff --git a/instruction-templates/Bluemoon.yaml b/user_data/instruction-templates/Bluemoon.yaml similarity index 100% rename from instruction-templates/Bluemoon.yaml rename to user_data/instruction-templates/Bluemoon.yaml diff --git a/instruction-templates/ChatGLM.yaml b/user_data/instruction-templates/ChatGLM.yaml similarity index 100% rename from instruction-templates/ChatGLM.yaml rename to user_data/instruction-templates/ChatGLM.yaml diff --git 
a/instruction-templates/ChatML.yaml b/user_data/instruction-templates/ChatML.yaml similarity index 100% rename from instruction-templates/ChatML.yaml rename to user_data/instruction-templates/ChatML.yaml diff --git a/instruction-templates/Chinese-Vicuna-Chat.yaml b/user_data/instruction-templates/Chinese-Vicuna-Chat.yaml similarity index 100% rename from instruction-templates/Chinese-Vicuna-Chat.yaml rename to user_data/instruction-templates/Chinese-Vicuna-Chat.yaml diff --git a/instruction-templates/Command-R.yaml b/user_data/instruction-templates/Command-R.yaml similarity index 100% rename from instruction-templates/Command-R.yaml rename to user_data/instruction-templates/Command-R.yaml diff --git a/instruction-templates/Galactica Cite.yaml b/user_data/instruction-templates/Galactica Cite.yaml similarity index 100% rename from instruction-templates/Galactica Cite.yaml rename to user_data/instruction-templates/Galactica Cite.yaml diff --git a/instruction-templates/Galactica Finetuned.yaml b/user_data/instruction-templates/Galactica Finetuned.yaml similarity index 100% rename from instruction-templates/Galactica Finetuned.yaml rename to user_data/instruction-templates/Galactica Finetuned.yaml diff --git a/instruction-templates/Galactica Q.yaml b/user_data/instruction-templates/Galactica Q.yaml similarity index 100% rename from instruction-templates/Galactica Q.yaml rename to user_data/instruction-templates/Galactica Q.yaml diff --git a/instruction-templates/Galactica Summary.yaml b/user_data/instruction-templates/Galactica Summary.yaml similarity index 100% rename from instruction-templates/Galactica Summary.yaml rename to user_data/instruction-templates/Galactica Summary.yaml diff --git a/instruction-templates/Galactica Work.yaml b/user_data/instruction-templates/Galactica Work.yaml similarity index 100% rename from instruction-templates/Galactica Work.yaml rename to user_data/instruction-templates/Galactica Work.yaml diff --git a/instruction-templates/Galactica v2.yaml b/user_data/instruction-templates/Galactica v2.yaml similarity index 100% rename from instruction-templates/Galactica v2.yaml rename to user_data/instruction-templates/Galactica v2.yaml diff --git a/instruction-templates/Galactica.yaml b/user_data/instruction-templates/Galactica.yaml similarity index 100% rename from instruction-templates/Galactica.yaml rename to user_data/instruction-templates/Galactica.yaml diff --git a/instruction-templates/Gorilla.yaml b/user_data/instruction-templates/Gorilla.yaml similarity index 100% rename from instruction-templates/Gorilla.yaml rename to user_data/instruction-templates/Gorilla.yaml diff --git a/instruction-templates/Guanaco non-chat.yaml b/user_data/instruction-templates/Guanaco non-chat.yaml similarity index 100% rename from instruction-templates/Guanaco non-chat.yaml rename to user_data/instruction-templates/Guanaco non-chat.yaml diff --git a/instruction-templates/Guanaco-QLoRA.yaml b/user_data/instruction-templates/Guanaco-QLoRA.yaml similarity index 100% rename from instruction-templates/Guanaco-QLoRA.yaml rename to user_data/instruction-templates/Guanaco-QLoRA.yaml diff --git a/instruction-templates/H2O-prompt_answer.yaml b/user_data/instruction-templates/H2O-prompt_answer.yaml similarity index 100% rename from instruction-templates/H2O-prompt_answer.yaml rename to user_data/instruction-templates/H2O-prompt_answer.yaml diff --git a/instruction-templates/Hippogriff.yaml b/user_data/instruction-templates/Hippogriff.yaml similarity index 100% rename from 
instruction-templates/Hippogriff.yaml
rename to user_data/instruction-templates/Hippogriff.yaml
diff --git a/instruction-templates/INCITE-Chat.yaml b/user_data/instruction-templates/INCITE-Chat.yaml
similarity index 100%
rename from instruction-templates/INCITE-Chat.yaml
rename to user_data/instruction-templates/INCITE-Chat.yaml
diff --git a/instruction-templates/INCITE-Instruct.yaml b/user_data/instruction-templates/INCITE-Instruct.yaml
similarity index 100%
rename from instruction-templates/INCITE-Instruct.yaml
rename to user_data/instruction-templates/INCITE-Instruct.yaml
diff --git a/instruction-templates/KoAlpaca.yaml b/user_data/instruction-templates/KoAlpaca.yaml
similarity index 100%
rename from instruction-templates/KoAlpaca.yaml
rename to user_data/instruction-templates/KoAlpaca.yaml
diff --git a/instruction-templates/Koala.yaml b/user_data/instruction-templates/Koala.yaml
similarity index 100%
rename from instruction-templates/Koala.yaml
rename to user_data/instruction-templates/Koala.yaml
diff --git a/instruction-templates/LLaVA.yaml b/user_data/instruction-templates/LLaVA.yaml
similarity index 100%
rename from instruction-templates/LLaVA.yaml
rename to user_data/instruction-templates/LLaVA.yaml
diff --git a/instruction-templates/Llama-v2.yaml b/user_data/instruction-templates/Llama-v2.yaml
similarity index 100%
rename from instruction-templates/Llama-v2.yaml
rename to user_data/instruction-templates/Llama-v2.yaml
diff --git a/instruction-templates/Llama-v3.yaml b/user_data/instruction-templates/Llama-v3.yaml
similarity index 100%
rename from instruction-templates/Llama-v3.yaml
rename to user_data/instruction-templates/Llama-v3.yaml
diff --git a/instruction-templates/MOSS.yaml b/user_data/instruction-templates/MOSS.yaml
similarity index 100%
rename from instruction-templates/MOSS.yaml
rename to user_data/instruction-templates/MOSS.yaml
diff --git a/instruction-templates/Manticore Chat.yaml b/user_data/instruction-templates/Manticore Chat.yaml
similarity index 100%
rename from instruction-templates/Manticore Chat.yaml
rename to user_data/instruction-templates/Manticore Chat.yaml
diff --git a/instruction-templates/Metharme.yaml b/user_data/instruction-templates/Metharme.yaml
similarity index 100%
rename from instruction-templates/Metharme.yaml
rename to user_data/instruction-templates/Metharme.yaml
diff --git a/instruction-templates/Mistral.yaml b/user_data/instruction-templates/Mistral.yaml
similarity index 100%
rename from instruction-templates/Mistral.yaml
rename to user_data/instruction-templates/Mistral.yaml
diff --git a/instruction-templates/NVIDIA-ChatQA.yaml b/user_data/instruction-templates/NVIDIA-ChatQA.yaml
similarity index 100%
rename from instruction-templates/NVIDIA-ChatQA.yaml
rename to user_data/instruction-templates/NVIDIA-ChatQA.yaml
diff --git a/instruction-templates/NewHope.yaml b/user_data/instruction-templates/NewHope.yaml
similarity index 100%
rename from instruction-templates/NewHope.yaml
rename to user_data/instruction-templates/NewHope.yaml
diff --git a/instruction-templates/Open Assistant.yaml b/user_data/instruction-templates/Open Assistant.yaml
similarity index 100%
rename from instruction-templates/Open Assistant.yaml
rename to user_data/instruction-templates/Open Assistant.yaml
diff --git a/instruction-templates/OpenBuddy.yaml b/user_data/instruction-templates/OpenBuddy.yaml
similarity index 100%
rename from instruction-templates/OpenBuddy.yaml
rename to user_data/instruction-templates/OpenBuddy.yaml
diff --git a/instruction-templates/OpenChat.yaml b/user_data/instruction-templates/OpenChat.yaml
similarity index 100%
rename from instruction-templates/OpenChat.yaml
rename to user_data/instruction-templates/OpenChat.yaml
diff --git a/instruction-templates/OpenOrca-Platypus2.yaml b/user_data/instruction-templates/OpenOrca-Platypus2.yaml
similarity index 100%
rename from instruction-templates/OpenOrca-Platypus2.yaml
rename to user_data/instruction-templates/OpenOrca-Platypus2.yaml
diff --git a/instruction-templates/Orca Mini.yaml b/user_data/instruction-templates/Orca Mini.yaml
similarity index 100%
rename from instruction-templates/Orca Mini.yaml
rename to user_data/instruction-templates/Orca Mini.yaml
diff --git a/instruction-templates/Orca-Vicuna.yaml b/user_data/instruction-templates/Orca-Vicuna.yaml
similarity index 100%
rename from instruction-templates/Orca-Vicuna.yaml
rename to user_data/instruction-templates/Orca-Vicuna.yaml
diff --git a/instruction-templates/RWKV-Raven.yaml b/user_data/instruction-templates/RWKV-Raven.yaml
similarity index 100%
rename from instruction-templates/RWKV-Raven.yaml
rename to user_data/instruction-templates/RWKV-Raven.yaml
diff --git a/instruction-templates/RWKV-World.yaml b/user_data/instruction-templates/RWKV-World.yaml
similarity index 100%
rename from instruction-templates/RWKV-World.yaml
rename to user_data/instruction-templates/RWKV-World.yaml
diff --git a/instruction-templates/Samantha.yaml b/user_data/instruction-templates/Samantha.yaml
similarity index 100%
rename from instruction-templates/Samantha.yaml
rename to user_data/instruction-templates/Samantha.yaml
diff --git a/instruction-templates/StableBeluga2.yaml b/user_data/instruction-templates/StableBeluga2.yaml
similarity index 100%
rename from instruction-templates/StableBeluga2.yaml
rename to user_data/instruction-templates/StableBeluga2.yaml
diff --git a/instruction-templates/StableLM.yaml b/user_data/instruction-templates/StableLM.yaml
similarity index 100%
rename from instruction-templates/StableLM.yaml
rename to user_data/instruction-templates/StableLM.yaml
diff --git a/instruction-templates/StableVicuna.yaml b/user_data/instruction-templates/StableVicuna.yaml
similarity index 100%
rename from instruction-templates/StableVicuna.yaml
rename to user_data/instruction-templates/StableVicuna.yaml
diff --git a/instruction-templates/Starchat-Beta.yaml b/user_data/instruction-templates/Starchat-Beta.yaml
similarity index 100%
rename from instruction-templates/Starchat-Beta.yaml
rename to user_data/instruction-templates/Starchat-Beta.yaml
diff --git a/instruction-templates/Synthia-CoT.yaml b/user_data/instruction-templates/Synthia-CoT.yaml
similarity index 100%
rename from instruction-templates/Synthia-CoT.yaml
rename to user_data/instruction-templates/Synthia-CoT.yaml
diff --git a/instruction-templates/Synthia.yaml b/user_data/instruction-templates/Synthia.yaml
similarity index 100%
rename from instruction-templates/Synthia.yaml
rename to user_data/instruction-templates/Synthia.yaml
diff --git a/instruction-templates/Tulu.yaml b/user_data/instruction-templates/Tulu.yaml
similarity index 100%
rename from instruction-templates/Tulu.yaml
rename to user_data/instruction-templates/Tulu.yaml
diff --git a/instruction-templates/Vicuna-v0.yaml b/user_data/instruction-templates/Vicuna-v0.yaml
similarity index 100%
rename from instruction-templates/Vicuna-v0.yaml
rename to user_data/instruction-templates/Vicuna-v0.yaml
diff --git a/instruction-templates/Vicuna-v1.1.yaml b/user_data/instruction-templates/Vicuna-v1.1.yaml
similarity index 100%
rename from instruction-templates/Vicuna-v1.1.yaml
rename to user_data/instruction-templates/Vicuna-v1.1.yaml
diff --git a/instruction-templates/Vigogne-Chat.yaml b/user_data/instruction-templates/Vigogne-Chat.yaml
similarity index 100%
rename from instruction-templates/Vigogne-Chat.yaml
rename to user_data/instruction-templates/Vigogne-Chat.yaml
diff --git a/instruction-templates/Vigogne-Instruct.yaml b/user_data/instruction-templates/Vigogne-Instruct.yaml
similarity index 100%
rename from instruction-templates/Vigogne-Instruct.yaml
rename to user_data/instruction-templates/Vigogne-Instruct.yaml
diff --git a/instruction-templates/Wizard-Mega ShareGPT.yaml b/user_data/instruction-templates/Wizard-Mega ShareGPT.yaml
similarity index 100%
rename from instruction-templates/Wizard-Mega ShareGPT.yaml
rename to user_data/instruction-templates/Wizard-Mega ShareGPT.yaml
diff --git a/instruction-templates/Wizard-Mega.yaml b/user_data/instruction-templates/Wizard-Mega.yaml
similarity index 100%
rename from instruction-templates/Wizard-Mega.yaml
rename to user_data/instruction-templates/Wizard-Mega.yaml
diff --git a/instruction-templates/Ziya.yaml b/user_data/instruction-templates/Ziya.yaml
similarity index 100%
rename from instruction-templates/Ziya.yaml
rename to user_data/instruction-templates/Ziya.yaml
diff --git a/loras/place-your-loras-here.txt b/user_data/loras/place-your-loras-here.txt
similarity index 100%
rename from loras/place-your-loras-here.txt
rename to user_data/loras/place-your-loras-here.txt
diff --git a/models/config.yaml b/user_data/models/config.yaml
similarity index 100%
rename from models/config.yaml
rename to user_data/models/config.yaml
diff --git a/models/place-your-models-here.txt b/user_data/models/place-your-models-here.txt
similarity index 100%
rename from models/place-your-models-here.txt
rename to user_data/models/place-your-models-here.txt
diff --git a/presets/Contrastive Search.yaml b/user_data/presets/Contrastive Search.yaml
similarity index 100%
rename from presets/Contrastive Search.yaml
rename to user_data/presets/Contrastive Search.yaml
diff --git a/presets/Creative.yaml b/user_data/presets/Creative.yaml
similarity index 100%
rename from presets/Creative.yaml
rename to user_data/presets/Creative.yaml
diff --git a/presets/Deterministic.yaml b/user_data/presets/Deterministic.yaml
similarity index 100%
rename from presets/Deterministic.yaml
rename to user_data/presets/Deterministic.yaml
diff --git a/presets/Instruct.yaml b/user_data/presets/Instruct.yaml
similarity index 100%
rename from presets/Instruct.yaml
rename to user_data/presets/Instruct.yaml
diff --git a/presets/Null preset.yaml b/user_data/presets/Null preset.yaml
similarity index 100%
rename from presets/Null preset.yaml
rename to user_data/presets/Null preset.yaml
diff --git a/presets/min_p.yaml b/user_data/presets/min_p.yaml
similarity index 100%
rename from presets/min_p.yaml
rename to user_data/presets/min_p.yaml
diff --git a/prompts/Alpaca-with-Input.txt b/user_data/prompts/Alpaca-with-Input.txt
similarity index 100%
rename from prompts/Alpaca-with-Input.txt
rename to user_data/prompts/Alpaca-with-Input.txt
diff --git a/prompts/QA.txt b/user_data/prompts/QA.txt
similarity index 100%
rename from prompts/QA.txt
rename to user_data/prompts/QA.txt
diff --git a/settings-template.yaml b/user_data/settings-template.yaml
similarity index 98%
rename from settings-template.yaml
rename to user_data/settings-template.yaml
index 0343df0a..83764f97 100644
--- a/settings-template.yaml
+++ b/user_data/settings-template.yaml
@@ -1,6 +1,6 @@
 show_controls: true
 start_with: ''
-mode: chat-instruct
+mode: instruct
 chat_style: cai-chat
 chat-instruct_command: |-
   Continue the chat dialogue below. Write a single reply for the character "<|character|>".
@@ -29,7 +29,6 @@ truncation_length: 8192
 seed: -1
 custom_stopping_strings: ''
 custom_token_bans: ''
-show_after: ''
 negative_prompt: ''
 autoload_model: false
 dark_theme: true
diff --git a/training/datasets/put-trainer-datasets-here.txt b/user_data/training/datasets/put-trainer-datasets-here.txt
similarity index 100%
rename from training/datasets/put-trainer-datasets-here.txt
rename to user_data/training/datasets/put-trainer-datasets-here.txt
diff --git a/training/formats/ChatML-format.json b/user_data/training/formats/ChatML-format.json
similarity index 100%
rename from training/formats/ChatML-format.json
rename to user_data/training/formats/ChatML-format.json
diff --git a/training/formats/alpaca-chatbot-format.json b/user_data/training/formats/alpaca-chatbot-format.json
similarity index 100%
rename from training/formats/alpaca-chatbot-format.json
rename to user_data/training/formats/alpaca-chatbot-format.json
diff --git a/training/formats/alpaca-format.json b/user_data/training/formats/alpaca-format.json
similarity index 100%
rename from training/formats/alpaca-format.json
rename to user_data/training/formats/alpaca-format.json
diff --git a/training/formats/llama2-chat-format.json b/user_data/training/formats/llama2-chat-format.json
similarity index 100%
rename from training/formats/llama2-chat-format.json
rename to user_data/training/formats/llama2-chat-format.json
diff --git a/training/formats/vicuna-format.json b/user_data/training/formats/vicuna-format.json
similarity index 100%
rename from training/formats/vicuna-format.json
rename to user_data/training/formats/vicuna-format.json
diff --git a/wsl.sh b/wsl.sh
deleted file mode 100755
index c5d28b16..00000000
--- a/wsl.sh
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/bin/bash
-
-# detect if build-essential is missing or broken
-if ! dpkg-query -W -f'${Status}' "build-essential" 2>/dev/null | grep -q "ok installed"; then
-echo "build-essential not found or broken!
-
-A C++ compiler is required to build needed Python packages!
-To install one, run cmd_wsl.bat and enter these commands:
-
-sudo apt-get update
-sudo apt-get install build-essential
-"
-read -n1 -p "Continue the installer anyway? [y,n]" EXIT_PROMPT
-# only continue if user inputs 'y' else exit
-if ! [[ $EXIT_PROMPT == "Y" || $EXIT_PROMPT == "y" ]]; then exit; fi
-fi
-
-# deactivate existing conda envs as needed to avoid conflicts
-{ conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null
-
-# config unlike other scripts, can't use current directory due to file IO bug in WSL, needs to be in virtual drive
-INSTALL_DIR_PREFIX="$HOME/text-gen-install"
-if [[ ! $(realpath "$(pwd)/..") = /mnt/* ]]; then
-    INSTALL_DIR_PREFIX="$(realpath "$(pwd)/..")" && INSTALL_INPLACE=1
-fi
-INSTALL_DIR="$INSTALL_DIR_PREFIX/text-generation-webui"
-CONDA_ROOT_PREFIX="$INSTALL_DIR/installer_files/conda"
-INSTALL_ENV_DIR="$INSTALL_DIR/installer_files/env"
-MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py311_24.11.1-0-Linux-x86_64.sh"
-conda_exists="F"
-
-# environment isolation
-export PYTHONNOUSERSITE=1
-unset PYTHONPATH
-unset PYTHONHOME
-export CUDA_PATH="$INSTALL_ENV_DIR"
-export CUDA_HOME="$CUDA_PATH"
-
-# /usr/lib/wsl/lib needs to be added to LD_LIBRARY_PATH to fix years-old bug in WSL where GPU drivers aren't linked properly
-export LD_LIBRARY_PATH="$CUDA_HOME/lib:/usr/lib/wsl/lib:$LD_LIBRARY_PATH"
-
-# open bash cli if called with 'wsl.sh cmd' with workarounds for existing conda
-if [ "$1" == "cmd" ]; then
-    exec bash --init-file <(echo ". ~/.bashrc; conda deactivate 2> /dev/null; cd $INSTALL_DIR || cd $HOME; source $CONDA_ROOT_PREFIX/etc/profile.d/conda.sh; conda activate $INSTALL_ENV_DIR")
-    exit
-fi
-
-if [[ "$INSTALL_DIR" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
-
-# create install dir if missing
-if [ ! -d "$INSTALL_DIR" ]; then mkdir -p "$INSTALL_DIR" || exit; fi
-
-# figure out whether git and conda needs to be installed
-if "$CONDA_ROOT_PREFIX/bin/conda" --version &>/dev/null; then conda_exists="T"; fi
-
-# (if necessary) install git and conda into a contained environment
-# download miniconda
-if [ "$conda_exists" == "F" ]; then
-    echo "Downloading Miniconda from $MINICONDA_DOWNLOAD_URL to $INSTALL_DIR/miniconda_installer.sh"
-
-    curl -L "$MINICONDA_DOWNLOAD_URL" > "$INSTALL_DIR/miniconda_installer.sh"
-
-    chmod u+x "$INSTALL_DIR/miniconda_installer.sh"
-    bash "$INSTALL_DIR/miniconda_installer.sh" -b -p $CONDA_ROOT_PREFIX
-
-    # test the conda binary
-    echo "Miniconda version:"
-    "$CONDA_ROOT_PREFIX/bin/conda" --version
-
-    # delete the Miniconda installer
-    rm "$INSTALL_DIR/miniconda_installer.sh"
-fi
-
-# create the installer env
-if [ ! -e "$INSTALL_ENV_DIR" ]; then
-    "$CONDA_ROOT_PREFIX/bin/conda" create -y -k --prefix "$INSTALL_ENV_DIR" python=3.11 git
-fi
-
-# check if conda environment was actually created
-if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
-    echo "Conda environment is empty."
-    exit
-fi
-
-# activate installer env
-source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script)
-conda activate "$INSTALL_ENV_DIR"
-
-pushd $INSTALL_DIR 1> /dev/null || exit
-
-if [ ! -f "./server.py" ]; then
-    git init -b main
-    git remote add origin https://github.com/oobabooga/text-generation-webui
-    git fetch
-    git remote set-head origin -a
-    git reset origin/HEAD --hard
-    git branch --set-upstream-to=origin/HEAD
-    git restore -- . :!./CMD_FLAGS.txt
-fi
-
-# copy CMD_FLAGS.txt to install dir to allow edits within Windows
-if [[ $INSTALL_INPLACE != 1 ]]; then
-    # workaround for old install migration
-    if [ ! -f "./wsl.sh" ]; then
-        git pull || exit
-        [ -f "../webui.py" ] && mv "../webui.py" "../webui-old.py"
-    fi
-    if [ -f "$(dirs +1)/CMD_FLAGS.txt" ] && [ -f "./CMD_FLAGS.txt" ]; then cp -u "$(dirs +1)/CMD_FLAGS.txt" "$INSTALL_DIR"; fi
-fi
-
-# setup installer env update env if called with 'wsl.sh update'
-case "$1" in
-("update-wizard") python one_click.py --update-wizard;;
-(*) python one_click.py $@;;