diff --git a/README.md b/README.md index c3f7d8a6..d350d959 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ conda install -y -c "nvidia/label/cuda-12.8.1" cuda ``` git clone https://github.com/oobabooga/text-generation-webui cd text-generation-webui -pip install -r +pip install -r requirements/full/ ``` Requirements file to use: @@ -240,16 +240,16 @@ List of command-line flags ```txt usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS] - [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] - [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] - [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock] - [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--mmproj MMPROJ] [--ctx-size N] [--cache-type N] - [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--enable-tp] - [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] [--deepspeed] - [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] - [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] - [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] - [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] + [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] + [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-layers N] [--mmproj MMPROJ] [--streaming-llm] + [--tensor-split TENSOR_SPLIT] [--row-split] [--no-mmap] [--mlock] [--no-kv-offload] [--batch-size BATCH_SIZE] [--threads THREADS] [--threads-batch THREADS_BATCH] [--numa] + [--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] + [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] + [--enable-tp] [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] + [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] + [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] 
[--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] + [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] + [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text Generation Web UI @@ -273,6 +273,33 @@ Model loader: --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM. +Context and cache: + --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. + --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits + separately, e.g. q4_q8). + +Speculative decoding: + --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. + --draft-max DRAFT_MAX Number of tokens to draft for speculative decoding. + --gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model. + --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 + --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. + +llama.cpp: + --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. + --mmproj MMPROJ Path to the mmproj file for vision models. + --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. + --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. + --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. + --no-mmap Prevent mmap from being used. + --mlock Force the system to keep the model in RAM. + --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. + --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. + --threads THREADS Number of threads to use. + --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. + --numa Activate NUMA task allocation for llama.cpp. + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" + Transformers/Accelerate: --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow. --cpu-memory CPU_MEMORY Maximum CPU memory in GiB. Use this for CPU offloading. @@ -292,34 +319,6 @@ bitsandbytes 4-bit: --compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32. --quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4. -llama.cpp: - --flash-attn Use flash-attention. - --threads THREADS Number of threads to use. - --threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing. - --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. - --no-mmap Prevent mmap from being used. - --mlock Force the system to keep the model in RAM. - --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. - --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. - --numa Activate NUMA task allocation for llama.cpp. 
- --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. - --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. - --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" - --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. - --mmproj MMPROJ Path to the mmproj file for vision models. - -Context and cache: - --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. - --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits - separately, e.g. q4_q8). - -Speculative decoding: - --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. - --draft-max DRAFT_MAX Number of tokens to draft for speculative decoding. - --gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model. - --device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1 - --ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model. - ExLlamaV3: --enable-tp, --enable_tp Enable Tensor Parallelism (TP) to split the model across GPUs. --tp-backend TP_BACKEND The backend for tensor parallelism. Valid options: native, nccl. Default: native. diff --git a/css/main.css b/css/main.css index c7ee57da..fd79d24c 100644 --- a/css/main.css +++ b/css/main.css @@ -22,10 +22,6 @@ font-style: italic; } -.tabs.svelte-710i53 { - margin-top: 0 -} - .padded.svelte-12cmxck { padding: 3px 0; } @@ -126,10 +122,6 @@ span.math.inline { vertical-align: baseline !important; } -div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { - flex-wrap: nowrap; -} - gradio-app > :first-child { padding: 0 !important; } @@ -325,7 +317,7 @@ audio { /* ---------------------------------------------- Chat tab ---------------------------------------------- */ -.h-\[40dvh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { +.h-\[40dvh\] { height: 66.67dvh } @@ -338,23 +330,10 @@ audio { width: unset } -div.svelte-362y77>*, div.svelte-362y77>.form>* { - flex-wrap: nowrap -} - .pending.svelte-1ed2p3z { opacity: 1; } -.wrap.svelte-6roggh.svelte-6roggh { - max-height: 92.5%; -} - -/* This is for the microphone button in the whisper extension */ -.sm.svelte-1ipelgc { - width: 100%; -} - #chat-tab { padding: 0; } diff --git a/download-model.py b/download-model.py index c0a3aa36..756d529f 100644 --- a/download-model.py +++ b/download-model.py @@ -242,9 +242,19 @@ class ModelDownloader: try: if output_path.exists() and not start_from_scratch: current_file_size_on_disk = output_path.stat().st_size - r_head = session.head(url, timeout=20) - r_head.raise_for_status() - total_size = int(r_head.headers.get('content-length', 0)) + + # Make a HEAD request without following redirects to get metadata first + r_head = session.head(url, timeout=20, allow_redirects=True) + r_head.raise_for_status() # Will raise an error for 4xx or 5xx status codes + + # Check for the new 'x-linked-size' header from Hugging Face + if 'x-linked-size' in r_head.headers: + total_size = int(r_head.headers['x-linked-size']) + # Fallback to the old 'content-length' just in case + elif 'content-length' in r_head.headers: + total_size = int(r_head.headers.get('content-length', 0)) + else: + 
total_size = 0 if current_file_size_on_disk >= total_size and total_size > 0: if self.progress_queue is not None and total_size > 0: diff --git a/modules/chat.py b/modules/chat.py index 9857479a..55984d7a 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -880,7 +880,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Extract the reply if state['mode'] in ['chat', 'chat-instruct']: - reply = reply.lstrip() + if not _continue: + reply = reply.lstrip() + if reply.startswith(state['name2'] + ':'): reply = reply[len(state['name2'] + ':'):] elif reply.startswith(state['name1'] + ':'): @@ -1831,6 +1833,10 @@ def handle_branch_chat_click(state): history = state['history'] history['visible'] = history['visible'][:branch_from_index + 1] history['internal'] = history['internal'][:branch_from_index + 1] + # Prune the metadata dictionary to remove entries beyond the branch point + if 'metadata' in history: + history['metadata'] = {k: v for k, v in history['metadata'].items() if int(k.split('_')[-1]) <= branch_from_index} + new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, new_unique_id, state['character_menu'], state['mode']) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index c606912b..e05f8d7d 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -4,8 +4,6 @@ from pathlib import Path from typing import Any, Dict, Optional, Union import torch -from exllamav3 import Cache, Config, Model -from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from torch.nn import CrossEntropyLoss from transformers import ( GenerationConfig, @@ -15,6 +13,8 @@ from transformers import ( ) from transformers.modeling_outputs import CausalLMOutputWithPast +from exllamav3 import Cache, Config, Model +from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant from modules import shared from modules.logging_colors import logger @@ -103,12 +103,6 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): labels = kwargs.get('labels', None) past_key_values = kwargs.get('past_key_values', None) - # Reset the internal sequence state for standalone calls (logit viewer) - # or the very first step of a new generation. - if past_key_values is None: - self.past_seq = None - self.past_seq_negative = None - if len(args) > 0: if not shared.args.cfg_cache: logger.error("Please enable the cfg-cache option to use CFG with ExLlamav3_HF.") @@ -125,8 +119,8 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): ex_cache = self.ex_cache seq = input_ids[0].tolist() - if is_negative and past_key_values is not None and isinstance(past_key_values, list): - seq = past_key_values + seq + if is_negative and past_key_values is not None: + seq = past_key_values + seq seq_tensor = torch.tensor(seq) reset = True @@ -134,50 +128,97 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): # Maximum number of tokens to process in a single forward pass max_chunk_size = 256 - if past_seq is not None: - min_length = min(past_seq.shape[0], seq_tensor.shape[0]) - indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length])) - if len(indices) == 0 and seq_tensor.shape[0] > past_seq.shape[0]: - reset = False - - # Create a single `params` dictionary that will be used and modified - # in-place across all `forward` calls within this function. 
- params = { - "attn_mode": "flash_attn", - "cache": ex_cache, - "batch_shape": (1, self.max_tokens), - "reconstruct": False, - "past_len": 0 - } - # Make the forward call if labels is None: - # If it's an efficient continuation, process only the new tokens - if not reset: - params["past_len"] = past_seq.shape[0] - tokens_to_process = seq_tensor[past_seq.shape[0]:] - # Otherwise, process the whole sequence from scratch - else: - tokens_to_process = seq_tensor + if past_seq is not None: + min_length = min(past_seq.shape[0], seq_tensor.shape[0]) + indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length])) + if len(indices) > 0: + longest_prefix = indices[0].item() + else: + longest_prefix = min_length - # Process all but the last token of the sequence/sub-sequence - if tokens_to_process.shape[0] > 1: - prefix_to_process = tokens_to_process[:-1] + if longest_prefix > 0: + reset = False + current_len = longest_prefix + remaining_tokens = len(seq_tensor) - longest_prefix - 1 - # Process in chunks if the number of tokens is large - for i in range(0, prefix_to_process.shape[0], max_chunk_size): - chunk = prefix_to_process[i:i + max_chunk_size] - self.ex_model.forward(input_ids=chunk.view(1, -1), params=params) - params["past_len"] += chunk.shape[0] + if remaining_tokens > 0: + # Process tokens from longest_prefix to second-to-last token + tokens_to_process = seq_tensor[longest_prefix:-1] - # Process the last token to get logits - last_token = tokens_to_process[-1:].view(1, -1) - logits = self.ex_model.forward(input_ids=last_token, params=params).to(input_ids.device).float() + # Process in chunks if the number of tokens is large + for i in range(0, tokens_to_process.shape[0], max_chunk_size): + chunk = tokens_to_process[i:i + max_chunk_size] + self.ex_model.forward( + input_ids=chunk.view(1, -1), + params={ + "attn_mode": "flash_attn", + "cache": ex_cache, + "past_len": longest_prefix + i, + "batch_shape": (1, self.max_tokens), + "reconstruct": False # Force memory-efficient path + } + ) + + current_len = longest_prefix + remaining_tokens + + if reset: + if len(seq_tensor) > 1: + # Process all tokens except the last one + tokens_to_process = seq_tensor[:-1] + + # Process in chunks if the number of tokens is large + current_len = 0 + for i in range(0, tokens_to_process.shape[0], max_chunk_size): + chunk = tokens_to_process[i:i + max_chunk_size] + self.ex_model.forward( + input_ids=chunk.view(1, -1), + params={ + "attn_mode": "flash_attn", + "cache": ex_cache, + "past_len": current_len, + "batch_shape": (1, self.max_tokens), + "reconstruct": False # Force memory-efficient path + } + ) + current_len += chunk.shape[0] + else: + current_len = 0 + + # Process the last token and get logits + logits = self.ex_model.forward( + input_ids=seq_tensor[-1:].view(1, -1), + params={ + "attn_mode": "flash_attn", + "cache": ex_cache, + "past_len": current_len, + "batch_shape": (1, self.max_tokens), + "reconstruct": False # Force memory-efficient path + } + ).to(input_ids.device).float() else: # When processing with labels, handle as a complete sequence - params["attn_mode"] = "flash_attn_nc" - logits = self.ex_model.forward(input_ids=seq_tensor.view(1,-1), params=params).float() + # Process in chunks if the number of tokens is large + tokens_to_process = seq_tensor + all_logits = None + for i in range(0, tokens_to_process.shape[0], max_chunk_size): + chunk = tokens_to_process[i:i + max_chunk_size] + chunk_logits = self.ex_model.forward( + input_ids=chunk.view(1, -1), + params={ + 
"attn_mode": "flash_attn_nc", # No caching for training + "reconstruct": False # Force memory-efficient path + } + ).float() + + if all_logits is None: + all_logits = chunk_logits + else: + all_logits = torch.cat([all_logits, chunk_logits], dim=1) + + logits = all_logits if is_negative: self.past_seq_negative = seq_tensor diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 38589cf2..27890d8c 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -200,7 +200,10 @@ class LlamaServer: # Make the generation request response = self.session.post(url, json=payload, stream=True) try: - response.raise_for_status() # Raise an exception for HTTP errors + if response.status_code == 400 and response.json()["error"]["type"] == "exceed_context_size_error": + logger.error("The request exceeds the available context size, try increasing it") + else: + response.raise_for_status() # Raise an exception for HTTP errors full_text = "" diff --git a/modules/loaders.py b/modules/loaders.py index fe982ab5..609a54c6 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -45,7 +45,6 @@ loaders_and_params = OrderedDict({ 'disk', 'use_double_quant', 'bf16', - 'trust_remote_code', 'no_use_fast', ], 'ExLlamav3_HF': [ @@ -53,7 +52,6 @@ loaders_and_params = OrderedDict({ 'cache_type', 'gpu_split', 'cfg_cache', - 'trust_remote_code', 'no_use_fast', 'enable_tp', 'tp_backend', @@ -82,7 +80,6 @@ loaders_and_params = OrderedDict({ 'no_xformers', 'no_sdpa', 'cfg_cache', - 'trust_remote_code', 'no_use_fast', ], 'ExLlamav2': [ diff --git a/modules/shared.py b/modules/shared.py index 4daf43c9..e54ba654 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -50,6 +50,35 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft group = parser.add_argument_group('Model loader') group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.') +# Cache +group = parser.add_argument_group('Context and cache') +group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') +group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') + +# Speculative decoding +group = parser.add_argument_group('Speculative decoding') +group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.') +group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.') +group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.') +group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') +group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. 
If 0, uses the same as the main model.') + +# llama.cpp +group = parser.add_argument_group('llama.cpp') +group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.') +group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.') +group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') +group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') +group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') +group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') +group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') +group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') +group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') +group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') +group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') +group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') +group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"') + # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.') @@ -71,35 +100,6 @@ group.add_argument('--use_double_quant', action='store_true', help='use_double_q group.add_argument('--compute_dtype', type=str, default='float16', help='compute dtype for 4-bit. Valid options: bfloat16, float16, float32.') group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. Valid options: nf4, fp4.') -# llama.cpp -group = parser.add_argument_group('llama.cpp') -group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') -group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') -group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') -group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') -group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') -group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.') -group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') -group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') -group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') -group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') -group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"') -group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') -group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.') - -# Cache -group = parser.add_argument_group('Context and cache') -group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') -group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') - -# Speculative decoding -group = parser.add_argument_group('Speculative decoding') -group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.') -group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.') -group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.') -group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') -group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.') - # ExLlamaV3 group = parser.add_argument_group('ExLlamaV3') group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) to split the model across GPUs.') @@ -174,6 +174,7 @@ if cmd_flags_path.exists(): args = parser.parse_args() +original_args = copy.deepcopy(args) args_defaults = parser.parse_args([]) # Create a mapping of all argument aliases to their canonical names @@ -295,7 +296,13 @@ default_settings = copy.deepcopy(settings) def do_cmd_flags_warnings(): # Security warnings if args.trust_remote_code: - logger.warning('trust_remote_code is enabled. This is dangerous.') + logger.warning( + "The `--trust-remote-code` flag is enabled.\n" + "This allows models to execute arbitrary code on your machine.\n\n" + "1. Only use with models from sources you fully trust.\n" + "2. Set an access password with `--gradio-auth`." + ) + if 'COLAB_GPU' not in os.environ and not args.nowebui: if args.share: logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. 
Use it with care.") diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py index 7866f448..f1af1299 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -123,7 +123,7 @@ def load_tokenizer(model_name, tokenizer_dir=None): tokenizer = AutoTokenizer.from_pretrained( path_to_model, - trust_remote_code=shared.args.trust_remote_code, + trust_remote_code=shared.original_args.trust_remote_code, use_fast=not shared.args.no_use_fast ) @@ -140,13 +140,13 @@ def load_model_HF(model_name): 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, } - if shared.args.trust_remote_code: + if shared.original_args.trust_remote_code: params['trust_remote_code'] = True if shared.args.force_safetensors: params['force_safetensors'] = True - config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) + config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code) if 'chatglm' in model_name.lower(): LoaderClass = AutoModel diff --git a/modules/ui.py b/modules/ui.py index 12f43768..76533767 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -160,7 +160,6 @@ def list_model_elements(): 'no_sdpa', 'cfg_cache', 'cpp_runner', - 'trust_remote_code', 'no_use_fast', 'model_draft', 'draft_max', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 729700d4..50ada9f9 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -57,7 +57,6 @@ def create_ui(): shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') - shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). 
\n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') # Multimodal diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 97eac769..0609f69b 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -6,7 +6,6 @@ datasets einops fastapi==0.112.4 flash-linear-attention==0.3.2 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -15,7 +14,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -30,16 +29,20 @@ triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.10/exllamav3-0.0.10+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.10/exllamav3-0.0.10+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index b3b0005e..73d35185 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -4,7 +4,6 @@ colorama datasets einops fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -13,7 +12,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 
PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -28,13 +27,17 @@ triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 5e0d375e..dad354e6 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -4,7 +4,6 @@ colorama datasets einops fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -13,7 +12,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -28,13 +27,17 @@ triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 0bb837ba..ab48c839 100644 --- a/requirements/full/requirements_apple_intel.txt 
+++ b/requirements/full/requirements_apple_intel.txt @@ -4,7 +4,6 @@ colorama datasets einops fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -13,7 +12,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -28,11 +27,15 @@ triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 514c0662..3a341b8a 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -4,7 +4,6 @@ colorama datasets einops fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -13,7 +12,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -28,12 +27,16 @@ triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index f68fdd9d..3765d329 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -4,7 +4,6 @@ colorama datasets einops fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -13,7 +12,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -28,11 +27,15 @@ triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index b40f1af2..a3e4e516 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -4,7 +4,6 @@ colorama datasets einops fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -13,7 +12,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -28,11 +27,15 @@ triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx-py3-none-linux_x86_64.whl; 
platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 9de9e65a..abdd262e 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -6,7 +6,6 @@ datasets einops fastapi==0.112.4 flash-linear-attention==0.3.2 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -15,7 +14,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -30,16 +29,20 @@ triton-windows==3.3.1.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.10/exllamav3-0.0.10+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav3/releases/download/v0.0.10/exllamav3-0.0.10+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 3bd20dd9..47b0dcbd 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -4,7 
+4,6 @@ colorama datasets einops fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown @@ -13,7 +12,7 @@ pandas peft==0.17.* Pillow>=9.5.0 psutil -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -28,6 +27,10 @@ triton-windows==3.2.0.post19; platform_system == "Windows" tqdm wandb +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 7a38b1e6..d43007d0 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,11 +12,15 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 047d1c54..77a66dd8 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,12 +12,16 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" \ No newline at end of file +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and 
platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 5c8ae4df..a2479e98 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,12 +12,16 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index f41efd58..8d8e5233 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,11 +12,15 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine 
== "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 69158050..08b7240c 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,11 +12,15 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index ca66098c..857c5e66 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,11 +12,15 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_nowheels.txt 
b/requirements/portable/requirements_nowheels.txt index be624bb1..1a7b5683 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,6 +12,10 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 36aff361..d2e66763 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,11 +12,15 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index be7170e3..34927294 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -1,11 +1,10 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -gradio==4.37.* html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* -pydantic==2.8.2 +pydantic==2.11.0 PyPDF2==3.0.1 python-docx==1.1.2 pyyaml @@ -13,11 +12,15 @@ requests rich tqdm +# Gradio +gradio==4.37.* +https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl + # API flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == 
"x86_64"