Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-12-06 07:12:10 +01:00)

Commit 771130532c

README.md (77)
@@ -164,7 +164,7 @@ conda install -y -c "nvidia/label/cuda-12.8.1" cuda
 ```
 git clone https://github.com/oobabooga/text-generation-webui
 cd text-generation-webui
-pip install -r <requirements file according to table below>
+pip install -r requirements/full/<requirements file according to table below>
 ```
 
 Requirements file to use:

@@ -240,16 +240,16 @@ List of command-line flags
 
 ```txt
 usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
-[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
-[--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant]
-[--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock]
-[--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--mmproj MMPROJ] [--ctx-size N] [--cache-type N]
-[--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--enable-tp]
-[--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner] [--deepspeed]
-[--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen]
-[--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE]
-[--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY]
-[--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
+[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT]
+[--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-layers N] [--mmproj MMPROJ] [--streaming-llm]
+[--tensor-split TENSOR_SPLIT] [--row-split] [--no-mmap] [--mlock] [--no-kv-offload] [--batch-size BATCH_SIZE] [--threads THREADS] [--threads-batch THREADS_BATCH] [--numa]
+[--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code]
+[--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE]
+[--enable-tp] [--tp-backend TP_BACKEND] [--gpu-split GPU_SPLIT] [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--cpp-runner]
+[--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB]
+[--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH]
+[--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT]
+[--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
 
 Text Generation Web UI
 
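As an aside, the `usage:` block and the grouped sections in the README's flag listing are the standard output of Python's argparse when arguments are registered through `add_argument_group`. A minimal sketch (not the project's code, flag subset chosen only for illustration):

```python
# Minimal sketch: argparse renders a wrapped "usage:" line plus one help
# section per argument group, which is the layout shown in the README.
import argparse

parser = argparse.ArgumentParser(prog="server.py")

group = parser.add_argument_group("Context and cache")
group.add_argument("--ctx-size", "--n_ctx", "--max_seq_len", type=int, default=8192,
                   metavar="N", help="Context size in tokens.")

group = parser.add_argument_group("Speculative decoding")
group.add_argument("--model-draft", type=str, default=None,
                   help="Path to the draft model for speculative decoding.")

# Prints a usage line followed by the grouped sections, like the listing above.
parser.print_help()
```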
@@ -273,6 +273,33 @@ Model loader:
   --loader LOADER                           Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2,
                                             TensorRT-LLM.
 
+Context and cache:
+  --ctx-size N, --n_ctx N, --max_seq_len N  Context size in tokens.
+  --cache-type N, --cache_type N            KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits
+                                            separately, e.g. q4_q8).
+
+Speculative decoding:
+  --model-draft MODEL_DRAFT                 Path to the draft model for speculative decoding.
+  --draft-max DRAFT_MAX                     Number of tokens to draft for speculative decoding.
+  --gpu-layers-draft GPU_LAYERS_DRAFT       Number of layers to offload to the GPU for the draft model.
+  --device-draft DEVICE_DRAFT               Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1
+  --ctx-size-draft CTX_SIZE_DRAFT           Size of the prompt context for the draft model. If 0, uses the same as the main model.
+
+llama.cpp:
+  --gpu-layers N, --n-gpu-layers N          Number of layers to offload to the GPU.
+  --mmproj MMPROJ                           Path to the mmproj file for vision models.
+  --streaming-llm                           Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
+  --tensor-split TENSOR_SPLIT               Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
+  --row-split                               Split the model by rows across GPUs. This may improve multi-gpu performance.
+  --no-mmap                                 Prevent mmap from being used.
+  --mlock                                   Force the system to keep the model in RAM.
+  --no-kv-offload                           Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
+  --batch-size BATCH_SIZE                   Maximum number of prompt tokens to batch together when calling llama_eval.
+  --threads THREADS                         Number of threads to use.
+  --threads-batch THREADS_BATCH             Number of threads to use for batches/prompt processing.
+  --numa                                    Activate NUMA task allocation for llama.cpp.
+  --extra-flags EXTRA_FLAGS                 Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
+
 Transformers/Accelerate:
   --cpu                                     Use the CPU to generate text. Warning: Training on CPU is extremely slow.
   --cpu-memory CPU_MEMORY                   Maximum CPU memory in GiB. Use this for CPU offloading.
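The `--cache-type` help above notes that ExLlamaV3 accepts a combined value such as `q4_q8` with separate k and v bit widths. A hypothetical helper (not code from the repository) showing how such a value could be split:

```python
# Hypothetical helper, for illustration only: split an ExLlamaV3 cache-type
# string like "q4_q8" into (k_bits, v_bits); "q6" means the same bits for both.
def parse_cache_type(value: str):
    if value == "fp16":
        return None                      # unquantized cache
    parts = value.split("_")             # "q4_q8" -> ["q4", "q8"]; "q6" -> ["q6"]
    k_bits = int(parts[0].lstrip("q"))
    v_bits = int(parts[-1].lstrip("q"))  # equals k_bits when only one component is given
    return k_bits, v_bits

assert parse_cache_type("q4_q8") == (4, 8)
assert parse_cache_type("q6") == (6, 6)
assert parse_cache_type("fp16") is None
```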
@@ -292,34 +319,6 @@ bitsandbytes 4-bit:
   --compute_dtype COMPUTE_DTYPE             compute dtype for 4-bit. Valid options: bfloat16, float16, float32.
   --quant_type QUANT_TYPE                   quant_type for 4-bit. Valid options: nf4, fp4.
 
-llama.cpp:
-  --flash-attn                              Use flash-attention.
-  --threads THREADS                         Number of threads to use.
-  --threads-batch THREADS_BATCH             Number of threads to use for batches/prompt processing.
-  --batch-size BATCH_SIZE                   Maximum number of prompt tokens to batch together when calling llama_eval.
-  --no-mmap                                 Prevent mmap from being used.
-  --mlock                                   Force the system to keep the model in RAM.
-  --gpu-layers N, --n-gpu-layers N          Number of layers to offload to the GPU.
-  --tensor-split TENSOR_SPLIT               Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
-  --numa                                    Activate NUMA task allocation for llama.cpp.
-  --no-kv-offload                           Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
-  --row-split                               Split the model by rows across GPUs. This may improve multi-gpu performance.
-  --extra-flags EXTRA_FLAGS                 Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
-  --streaming-llm                           Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
-  --mmproj MMPROJ                           Path to the mmproj file for vision models.
-
-Context and cache:
-  --ctx-size N, --n_ctx N, --max_seq_len N  Context size in tokens.
-  --cache-type N, --cache_type N            KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits
-                                            separately, e.g. q4_q8).
-
-Speculative decoding:
-  --model-draft MODEL_DRAFT                 Path to the draft model for speculative decoding.
-  --draft-max DRAFT_MAX                     Number of tokens to draft for speculative decoding.
-  --gpu-layers-draft GPU_LAYERS_DRAFT       Number of layers to offload to the GPU for the draft model.
-  --device-draft DEVICE_DRAFT               Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1
-  --ctx-size-draft CTX_SIZE_DRAFT           Size of the prompt context for the draft model. If 0, uses the same as the main model.
-
 ExLlamaV3:
   --enable-tp, --enable_tp                  Enable Tensor Parallelism (TP) to split the model across GPUs.
   --tp-backend TP_BACKEND                   The backend for tensor parallelism. Valid options: native, nccl. Default: native.
css/main.css (23)

@@ -22,10 +22,6 @@
     font-style: italic;
 }
 
-.tabs.svelte-710i53 {
-    margin-top: 0
-}
-
 .padded.svelte-12cmxck {
     padding: 3px 0;
 }
 
@@ -126,10 +122,6 @@ span.math.inline {
     vertical-align: baseline !important;
 }
 
-div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * {
-    flex-wrap: nowrap;
-}
-
 gradio-app > :first-child {
     padding: 0 !important;
 }
 
@@ -325,7 +317,7 @@ audio {
 /* ----------------------------------------------
    Chat tab
 ---------------------------------------------- */
-.h-\[40dvh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx {
+.h-\[40dvh\] {
     height: 66.67dvh
 }
 
@@ -338,23 +330,10 @@ audio {
     width: unset
 }
 
-div.svelte-362y77>*, div.svelte-362y77>.form>* {
-    flex-wrap: nowrap
-}
-
 .pending.svelte-1ed2p3z {
     opacity: 1;
 }
 
-.wrap.svelte-6roggh.svelte-6roggh {
-    max-height: 92.5%;
-}
-
-/* This is for the microphone button in the whisper extension */
-.sm.svelte-1ipelgc {
-    width: 100%;
-}
-
 #chat-tab {
     padding: 0;
 }
@@ -242,9 +242,19 @@ class ModelDownloader:
         try:
             if output_path.exists() and not start_from_scratch:
                 current_file_size_on_disk = output_path.stat().st_size
-                r_head = session.head(url, timeout=20)
-                r_head.raise_for_status()
-                total_size = int(r_head.headers.get('content-length', 0))
+                # Make a HEAD request, following redirects, to get the file metadata first
+                r_head = session.head(url, timeout=20, allow_redirects=True)
+                r_head.raise_for_status()  # Will raise an error for 4xx or 5xx status codes
+
+                # Check for the new 'x-linked-size' header from Hugging Face
+                if 'x-linked-size' in r_head.headers:
+                    total_size = int(r_head.headers['x-linked-size'])
+                # Fallback to the old 'content-length' just in case
+                elif 'content-length' in r_head.headers:
+                    total_size = int(r_head.headers.get('content-length', 0))
+                else:
+                    total_size = 0
+
                 if current_file_size_on_disk >= total_size and total_size > 0:
                     if self.progress_queue is not None and total_size > 0:
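A standalone sketch of the size probe introduced above, written against the plain `requests` API (the function name is illustrative, not the downloader's interface): prefer Hugging Face's `x-linked-size` header, fall back to `content-length`, and treat anything else as unknown.

```python
import requests

def remote_file_size(url: str, timeout: int = 20) -> int:
    """Return the advertised size of a remote file, or 0 if it cannot be determined."""
    r = requests.head(url, timeout=timeout, allow_redirects=True)
    r.raise_for_status()
    if "x-linked-size" in r.headers:
        return int(r.headers["x-linked-size"])
    return int(r.headers.get("content-length", 0))

# A resume check can then compare the local file size against this value and
# skip the download when local_size >= remote_size > 0.
```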
@@ -880,7 +880,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
         # Extract the reply
         if state['mode'] in ['chat', 'chat-instruct']:
-            reply = reply.lstrip()
+            if not _continue:
+                reply = reply.lstrip()
+
             if reply.startswith(state['name2'] + ':'):
                 reply = reply[len(state['name2'] + ':'):]
             elif reply.startswith(state['name1'] + ':'):
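Illustrative only: why the hunk above skips `lstrip()` during "Continue". When continuing, the new chunk is appended to text that already ends mid-sentence, so a leading space can be significant; for a fresh reply, stripping it is harmless.

```python
previous = "The quick brown"
new_chunk = " fox jumps over the lazy dog."

# _continue=True: keep the chunk as-is so the join reads "...brown fox..."
print(previous + new_chunk)      # The quick brown fox jumps over the lazy dog.

# _continue=False (a fresh reply): dropping leading whitespace is safe
print(new_chunk.lstrip())        # fox jumps over the lazy dog.
```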
@@ -1831,6 +1833,10 @@ def handle_branch_chat_click(state):
     history = state['history']
     history['visible'] = history['visible'][:branch_from_index + 1]
     history['internal'] = history['internal'][:branch_from_index + 1]
+
+    # Prune the metadata dictionary to remove entries beyond the branch point
+    if 'metadata' in history:
+        history['metadata'] = {k: v for k, v in history['metadata'].items() if int(k.split('_')[-1]) <= branch_from_index}
 
     new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
     save_history(history, new_unique_id, state['character_menu'], state['mode'])
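A small sketch of the pruning step added above, under the assumption (implied by the dict comprehension) that metadata keys end in the message index, e.g. `user_0`, `assistant_3`:

```python
metadata = {"user_0": {}, "assistant_0": {}, "user_1": {}, "assistant_1": {}}
branch_from_index = 0

# Keep only metadata whose trailing index is at or before the branch point.
pruned = {k: v for k, v in metadata.items() if int(k.split("_")[-1]) <= branch_from_index}
print(sorted(pruned))   # ['assistant_0', 'user_0']
```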
@@ -4,8 +4,6 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
 import torch
-from exllamav3 import Cache, Config, Model
-from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
 from torch.nn import CrossEntropyLoss
 from transformers import (
     GenerationConfig,

@@ -15,6 +13,8 @@ from transformers import (
 )
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
+from exllamav3 import Cache, Config, Model
+from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
 from modules import shared
 from modules.logging_colors import logger
 

@@ -103,12 +103,6 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
         labels = kwargs.get('labels', None)
         past_key_values = kwargs.get('past_key_values', None)
 
-        # Reset the internal sequence state for standalone calls (logit viewer)
-        # or the very first step of a new generation.
-        if past_key_values is None:
-            self.past_seq = None
-            self.past_seq_negative = None
-
         if len(args) > 0:
             if not shared.args.cfg_cache:
                 logger.error("Please enable the cfg-cache option to use CFG with ExLlamav3_HF.")

@@ -125,8 +119,8 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
         ex_cache = self.ex_cache
 
         seq = input_ids[0].tolist()
-        if is_negative and past_key_values is not None and isinstance(past_key_values, list):
+        if is_negative and past_key_values is not None:
             seq = past_key_values + seq
 
         seq_tensor = torch.tensor(seq)
         reset = True

@@ -134,50 +128,97 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
         # Maximum number of tokens to process in a single forward pass
         max_chunk_size = 256
 
-        if past_seq is not None:
-            min_length = min(past_seq.shape[0], seq_tensor.shape[0])
-            indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
-            if len(indices) == 0 and seq_tensor.shape[0] > past_seq.shape[0]:
-                reset = False
-
-        # Create a single `params` dictionary that will be used and modified
-        # in-place across all `forward` calls within this function.
-        params = {
-            "attn_mode": "flash_attn",
-            "cache": ex_cache,
-            "batch_shape": (1, self.max_tokens),
-            "reconstruct": False,
-            "past_len": 0
-        }
-
         # Make the forward call
         if labels is None:
-            # If it's an efficient continuation, process only the new tokens
-            if not reset:
-                params["past_len"] = past_seq.shape[0]
-                tokens_to_process = seq_tensor[past_seq.shape[0]:]
-            # Otherwise, process the whole sequence from scratch
-            else:
-                tokens_to_process = seq_tensor
-
-            # Process all but the last token of the sequence/sub-sequence
-            if tokens_to_process.shape[0] > 1:
-                prefix_to_process = tokens_to_process[:-1]
-
-                # Process in chunks if the number of tokens is large
-                for i in range(0, prefix_to_process.shape[0], max_chunk_size):
-                    chunk = prefix_to_process[i:i + max_chunk_size]
-                    self.ex_model.forward(input_ids=chunk.view(1, -1), params=params)
-                    params["past_len"] += chunk.shape[0]
-
-            # Process the last token to get logits
-            last_token = tokens_to_process[-1:].view(1, -1)
-            logits = self.ex_model.forward(input_ids=last_token, params=params).to(input_ids.device).float()
+            if past_seq is not None:
+                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
+                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
+                if len(indices) > 0:
+                    longest_prefix = indices[0].item()
+                else:
+                    longest_prefix = min_length
+
+                if longest_prefix > 0:
+                    reset = False
+                    current_len = longest_prefix
+                    remaining_tokens = len(seq_tensor) - longest_prefix - 1
+
+                    if remaining_tokens > 0:
+                        # Process tokens from longest_prefix to second-to-last token
+                        tokens_to_process = seq_tensor[longest_prefix:-1]
+
+                        # Process in chunks if the number of tokens is large
+                        for i in range(0, tokens_to_process.shape[0], max_chunk_size):
+                            chunk = tokens_to_process[i:i + max_chunk_size]
+                            self.ex_model.forward(
+                                input_ids=chunk.view(1, -1),
+                                params={
+                                    "attn_mode": "flash_attn",
+                                    "cache": ex_cache,
+                                    "past_len": longest_prefix + i,
+                                    "batch_shape": (1, self.max_tokens),
+                                    "reconstruct": False  # Force memory-efficient path
+                                }
+                            )
+
+                        current_len = longest_prefix + remaining_tokens
+
+            if reset:
+                if len(seq_tensor) > 1:
+                    # Process all tokens except the last one
+                    tokens_to_process = seq_tensor[:-1]
+
+                    # Process in chunks if the number of tokens is large
+                    current_len = 0
+                    for i in range(0, tokens_to_process.shape[0], max_chunk_size):
+                        chunk = tokens_to_process[i:i + max_chunk_size]
+                        self.ex_model.forward(
+                            input_ids=chunk.view(1, -1),
+                            params={
+                                "attn_mode": "flash_attn",
+                                "cache": ex_cache,
+                                "past_len": current_len,
+                                "batch_shape": (1, self.max_tokens),
+                                "reconstruct": False  # Force memory-efficient path
+                            }
+                        )
+                        current_len += chunk.shape[0]
+                else:
+                    current_len = 0
+
+            # Process the last token and get logits
+            logits = self.ex_model.forward(
+                input_ids=seq_tensor[-1:].view(1, -1),
+                params={
+                    "attn_mode": "flash_attn",
+                    "cache": ex_cache,
+                    "past_len": current_len,
+                    "batch_shape": (1, self.max_tokens),
+                    "reconstruct": False  # Force memory-efficient path
+                }
+            ).to(input_ids.device).float()
         else:
             # When processing with labels, handle as a complete sequence
-            params["attn_mode"] = "flash_attn_nc"
-            logits = self.ex_model.forward(input_ids=seq_tensor.view(1,-1), params=params).float()
+            # Process in chunks if the number of tokens is large
+            tokens_to_process = seq_tensor
+            all_logits = None
+
+            for i in range(0, tokens_to_process.shape[0], max_chunk_size):
+                chunk = tokens_to_process[i:i + max_chunk_size]
+                chunk_logits = self.ex_model.forward(
+                    input_ids=chunk.view(1, -1),
+                    params={
+                        "attn_mode": "flash_attn_nc",  # No caching for training
+                        "reconstruct": False  # Force memory-efficient path
+                    }
+                ).float()
+
+                if all_logits is None:
+                    all_logits = chunk_logits
+                else:
+                    all_logits = torch.cat([all_logits, chunk_logits], dim=1)
+
+            logits = all_logits
 
         if is_negative:
             self.past_seq_negative = seq_tensor
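The rewritten forward pass above reuses the cached sequence up to the longest shared prefix and feeds only the new tokens in fixed-size chunks. A self-contained PyTorch sketch of that idea, independent of exllamav3 (the model call is stubbed out):

```python
import torch

def longest_common_prefix(past_seq: torch.Tensor, seq: torch.Tensor) -> int:
    """Length of the shared prefix between the cached sequence and the new one."""
    min_length = min(past_seq.shape[0], seq.shape[0])
    mismatches = torch.nonzero(~torch.eq(past_seq[:min_length], seq[:min_length]))
    return mismatches[0].item() if len(mismatches) > 0 else min_length

def chunks(tokens: torch.Tensor, max_chunk_size: int = 256):
    """Yield (offset, chunk) pairs so past_len can advance with each forward call."""
    for i in range(0, tokens.shape[0], max_chunk_size):
        yield i, tokens[i:i + max_chunk_size]

past = torch.tensor([1, 2, 3, 4])
new = torch.tensor([1, 2, 3, 4, 5, 6])

prefix = longest_common_prefix(past, new)     # 4: the cached part is reused
for offset, chunk in chunks(new[prefix:-1]):  # only new tokens, minus the last one
    # a real loop would call model.forward(chunk, past_len=prefix + offset) here
    pass
print(prefix)                                 # 4
```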
@@ -200,7 +200,10 @@ class LlamaServer:
         # Make the generation request
         response = self.session.post(url, json=payload, stream=True)
         try:
-            response.raise_for_status()  # Raise an exception for HTTP errors
+            if response.status_code == 400 and response.json()["error"]["type"] == "exceed_context_size_error":
+                logger.error("The request exceeds the available context size, try increasing it")
+            else:
+                response.raise_for_status()  # Raise an exception for HTTP errors
 
             full_text = ""
 
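A minimal sketch of the error handling added in the hunk above, written against the plain `requests` API (the function name is illustrative): a 400 response whose error type is `exceed_context_size_error` is reported as a context-size problem, while any other HTTP error still raises.

```python
import requests

def check_generation_response(response: requests.Response) -> None:
    if response.status_code == 400:
        error = response.json().get("error", {})
        if error.get("type") == "exceed_context_size_error":
            print("The request exceeds the available context size, try increasing it")
            return
    response.raise_for_status()  # any other HTTP error is surfaced as usual
```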
@@ -45,7 +45,6 @@ loaders_and_params = OrderedDict({
         'disk',
         'use_double_quant',
         'bf16',
-        'trust_remote_code',
         'no_use_fast',
     ],
     'ExLlamav3_HF': [

@@ -53,7 +52,6 @@ loaders_and_params = OrderedDict({
         'cache_type',
         'gpu_split',
         'cfg_cache',
-        'trust_remote_code',
         'no_use_fast',
         'enable_tp',
         'tp_backend',

@@ -82,7 +80,6 @@ loaders_and_params = OrderedDict({
         'no_xformers',
         'no_sdpa',
         'cfg_cache',
-        'trust_remote_code',
         'no_use_fast',
     ],
     'ExLlamav2': [
@@ -50,6 +50,35 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
 group = parser.add_argument_group('Model loader')
 group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.')
 
+# Cache
+group = parser.add_argument_group('Context and cache')
+group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.')
+group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
+
+# Speculative decoding
+group = parser.add_argument_group('Speculative decoding')
+group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.')
+group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.')
+group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.')
+group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
+group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
+
+# llama.cpp
+group = parser.add_argument_group('llama.cpp')
+group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
+group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
+group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
+group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
+group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
+group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
+group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
+group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
+group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
+group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
+group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
+group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
+group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
+
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
 group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')

@@ -71,35 +100,6 @@ group.add_argument('--use_double_quant', action='store_true', help='use_double_q
 group.add_argument('--compute_dtype', type=str, default='float16', help='compute dtype for 4-bit. Valid options: bfloat16, float16, float32.')
 group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. Valid options: nf4, fp4.')
 
-# llama.cpp
-group = parser.add_argument_group('llama.cpp')
-group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
-group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
-group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
-group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
-group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
-group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
-group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
-group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
-group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
-group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
-group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
-group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
-group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
-
-# Cache
-group = parser.add_argument_group('Context and cache')
-group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.')
-group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
-
-# Speculative decoding
-group = parser.add_argument_group('Speculative decoding')
-group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.')
-group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.')
-group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.')
-group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
-group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
-
 # ExLlamaV3
 group = parser.add_argument_group('ExLlamaV3')
 group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) to split the model across GPUs.')
@@ -174,6 +174,7 @@ if cmd_flags_path.exists():
 
 
 args = parser.parse_args()
+original_args = copy.deepcopy(args)
 args_defaults = parser.parse_args([])
 
 # Create a mapping of all argument aliases to their canonical names
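An illustrative sketch of what the `original_args` deep copy above buys: the values given on the command line survive even if the live namespace is mutated later (for example by UI settings). The names here are local to the example, not the module's API.

```python
import argparse
import copy

parser = argparse.ArgumentParser()
parser.add_argument("--trust-remote-code", action="store_true")

args = parser.parse_args(["--trust-remote-code"])
original_args = copy.deepcopy(args)      # snapshot of the CLI decision

args.trust_remote_code = False           # later mutation, e.g. from a UI toggle
print(original_args.trust_remote_code)   # True: the command-line value is still authoritative
```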
@@ -295,7 +296,13 @@ default_settings = copy.deepcopy(settings)
 def do_cmd_flags_warnings():
     # Security warnings
     if args.trust_remote_code:
-        logger.warning('trust_remote_code is enabled. This is dangerous.')
+        logger.warning(
+            "The `--trust-remote-code` flag is enabled.\n"
+            "This allows models to execute arbitrary code on your machine.\n\n"
+            "1. Only use with models from sources you fully trust.\n"
+            "2. Set an access password with `--gradio-auth`."
+        )
 
     if 'COLAB_GPU' not in os.environ and not args.nowebui:
         if args.share:
             logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")
@@ -123,7 +123,7 @@ def load_tokenizer(model_name, tokenizer_dir=None):
 
         tokenizer = AutoTokenizer.from_pretrained(
             path_to_model,
-            trust_remote_code=shared.args.trust_remote_code,
+            trust_remote_code=shared.original_args.trust_remote_code,
             use_fast=not shared.args.no_use_fast
         )
 

@@ -140,13 +140,13 @@ def load_model_HF(model_name):
         'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
     }
 
-    if shared.args.trust_remote_code:
+    if shared.original_args.trust_remote_code:
         params['trust_remote_code'] = True
 
     if shared.args.force_safetensors:
         params['force_safetensors'] = True
 
-    config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
+    config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code)
 
     if 'chatglm' in model_name.lower():
         LoaderClass = AutoModel

@@ -160,7 +160,6 @@ def list_model_elements():
         'no_sdpa',
         'cfg_cache',
         'cpp_runner',
-        'trust_remote_code',
         'no_use_fast',
         'model_draft',
         'draft_max',

@@ -57,7 +57,6 @@ def create_ui():
            shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
            shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
            shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
-           shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
            shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
 
            # Multimodal
@@ -6,7 +6,6 @@ datasets
 einops
 fastapi==0.112.4
 flash-linear-attention==0.3.2
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown

@@ -15,7 +14,7 @@ pandas
 peft==0.17.*
 Pillow>=9.5.0
 psutil
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml

@@ -30,16 +29,20 @@ triton-windows==3.3.1.post19; platform_system == "Windows"
 tqdm
 wandb
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.10/exllamav3-0.0.10+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.10/exllamav3-0.0.10+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
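A side note on the requirements hunks, here and below: the `; platform_system == ...` suffixes on the wheel URLs are PEP 508 environment markers, which pip evaluates against the current platform to decide whether a line applies. A small sketch using the `packaging` library to evaluate such a marker (not part of the requirements files themselves):

```python
from packaging.markers import Marker

marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')

# True or False depending on the interpreter/platform running this code:
print(marker.evaluate())

# An explicit environment can also be supplied to test other platforms:
print(marker.evaluate({"platform_system": "Windows",
                       "platform_machine": "AMD64",
                       "python_version": "3.11"}))
```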
||||||
|
|
@ -4,7 +4,6 @@ colorama
|
||||||
datasets
|
datasets
|
||||||
einops
|
einops
|
||||||
fastapi==0.112.4
|
fastapi==0.112.4
|
||||||
gradio==4.37.*
|
|
||||||
html2text==2025.4.15
|
html2text==2025.4.15
|
||||||
jinja2==3.1.6
|
jinja2==3.1.6
|
||||||
markdown
|
markdown
|
||||||
|
|
@ -13,7 +12,7 @@ pandas
|
||||||
peft==0.17.*
|
peft==0.17.*
|
||||||
Pillow>=9.5.0
|
Pillow>=9.5.0
|
||||||
psutil
|
psutil
|
||||||
pydantic==2.8.2
|
pydantic==2.11.0
|
||||||
PyPDF2==3.0.1
|
PyPDF2==3.0.1
|
||||||
python-docx==1.1.2
|
python-docx==1.1.2
|
||||||
pyyaml
|
pyyaml
|
||||||
|
|
@ -28,13 +27,17 @@ triton-windows==3.2.0.post19; platform_system == "Windows"
|
||||||
tqdm
|
tqdm
|
||||||
wandb
|
wandb
|
||||||
|
|
||||||
|
# Gradio
|
||||||
|
gradio==4.37.*
|
||||||
|
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
|
||||||
|
|
||||||
# API
|
# API
|
||||||
flask_cloudflared==0.0.14
|
flask_cloudflared==0.0.14
|
||||||
sse-starlette==1.6.5
|
sse-starlette==1.6.5
|
||||||
tiktoken
|
tiktoken
|
||||||
|
|
||||||
# AMD wheels
|
# AMD wheels
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
|
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ colorama
|
||||||
datasets
|
datasets
|
||||||
einops
|
einops
|
||||||
fastapi==0.112.4
|
fastapi==0.112.4
|
||||||
gradio==4.37.*
|
|
||||||
html2text==2025.4.15
|
html2text==2025.4.15
|
||||||
jinja2==3.1.6
|
jinja2==3.1.6
|
||||||
markdown
|
markdown
|
||||||
|
|
@ -13,7 +12,7 @@ pandas
|
||||||
peft==0.17.*
|
peft==0.17.*
|
||||||
Pillow>=9.5.0
|
Pillow>=9.5.0
|
||||||
psutil
|
psutil
|
||||||
pydantic==2.8.2
|
pydantic==2.11.0
|
||||||
PyPDF2==3.0.1
|
PyPDF2==3.0.1
|
||||||
python-docx==1.1.2
|
python-docx==1.1.2
|
||||||
pyyaml
|
pyyaml
|
||||||
|
|
@ -28,13 +27,17 @@ triton-windows==3.2.0.post19; platform_system == "Windows"
|
||||||
tqdm
|
tqdm
|
||||||
wandb
|
wandb
|
||||||
|
|
||||||
|
# Gradio
|
||||||
|
gradio==4.37.*
|
||||||
|
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
|
||||||
|
|
||||||
# API
|
# API
|
||||||
flask_cloudflared==0.0.14
|
flask_cloudflared==0.0.14
|
||||||
sse-starlette==1.6.5
|
sse-starlette==1.6.5
|
||||||
tiktoken
|
tiktoken
|
||||||
|
|
||||||
# AMD wheels
|
# AMD wheels
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
|
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ colorama
|
||||||
datasets
|
datasets
|
||||||
einops
|
einops
|
||||||
fastapi==0.112.4
|
fastapi==0.112.4
|
||||||
gradio==4.37.*
|
|
||||||
html2text==2025.4.15
|
html2text==2025.4.15
|
||||||
jinja2==3.1.6
|
jinja2==3.1.6
|
||||||
markdown
|
markdown
|
||||||
|
|
@ -13,7 +12,7 @@ pandas
|
||||||
peft==0.17.*
|
peft==0.17.*
|
||||||
Pillow>=9.5.0
|
Pillow>=9.5.0
|
||||||
psutil
|
psutil
|
||||||
pydantic==2.8.2
|
pydantic==2.11.0
|
||||||
PyPDF2==3.0.1
|
PyPDF2==3.0.1
|
||||||
python-docx==1.1.2
|
python-docx==1.1.2
|
||||||
pyyaml
|
pyyaml
|
||||||
|
|
@ -28,11 +27,15 @@ triton-windows==3.2.0.post19; platform_system == "Windows"
|
||||||
tqdm
|
tqdm
|
||||||
wandb
|
wandb
|
||||||
|
|
||||||
|
# Gradio
|
||||||
|
gradio==4.37.*
|
||||||
|
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
|
||||||
|
|
||||||
# API
|
# API
|
||||||
flask_cloudflared==0.0.14
|
flask_cloudflared==0.0.14
|
||||||
sse-starlette==1.6.5
|
sse-starlette==1.6.5
|
||||||
tiktoken
|
tiktoken
|
||||||
|
|
||||||
# Mac wheels
|
# Mac wheels
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ colorama
|
||||||
datasets
|
datasets
|
||||||
einops
|
einops
|
||||||
fastapi==0.112.4
|
fastapi==0.112.4
|
||||||
gradio==4.37.*
|
|
||||||
html2text==2025.4.15
|
html2text==2025.4.15
|
||||||
jinja2==3.1.6
|
jinja2==3.1.6
|
||||||
markdown
|
markdown
|
||||||
|
|
@ -13,7 +12,7 @@ pandas
|
||||||
peft==0.17.*
|
peft==0.17.*
|
||||||
Pillow>=9.5.0
|
Pillow>=9.5.0
|
||||||
psutil
|
psutil
|
||||||
pydantic==2.8.2
|
pydantic==2.11.0
|
||||||
PyPDF2==3.0.1
|
PyPDF2==3.0.1
|
||||||
python-docx==1.1.2
|
python-docx==1.1.2
|
||||||
pyyaml
|
pyyaml
|
||||||
|
|
@ -28,12 +27,16 @@ triton-windows==3.2.0.post19; platform_system == "Windows"
|
||||||
tqdm
|
tqdm
|
||||||
wandb
|
wandb
|
||||||
|
|
||||||
|
# Gradio
|
||||||
|
gradio==4.37.*
|
||||||
|
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
|
||||||
|
|
||||||
# API
|
# API
|
||||||
flask_cloudflared==0.0.14
|
flask_cloudflared==0.0.14
|
||||||
sse-starlette==1.6.5
|
sse-starlette==1.6.5
|
||||||
tiktoken
|
tiktoken
|
||||||
|
|
||||||
# Mac wheels
|
# Mac wheels
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ colorama
|
||||||
datasets
|
datasets
|
||||||
einops
|
einops
|
||||||
fastapi==0.112.4
|
fastapi==0.112.4
|
||||||
gradio==4.37.*
|
|
||||||
html2text==2025.4.15
|
html2text==2025.4.15
|
||||||
jinja2==3.1.6
|
jinja2==3.1.6
|
||||||
markdown
|
markdown
|
||||||
|
|
@ -13,7 +12,7 @@ pandas
|
||||||
peft==0.17.*
|
peft==0.17.*
|
||||||
Pillow>=9.5.0
|
Pillow>=9.5.0
|
||||||
psutil
|
psutil
|
||||||
pydantic==2.8.2
|
pydantic==2.11.0
|
||||||
PyPDF2==3.0.1
|
PyPDF2==3.0.1
|
||||||
python-docx==1.1.2
|
python-docx==1.1.2
|
||||||
pyyaml
|
pyyaml
|
||||||
|
|
@ -28,11 +27,15 @@ triton-windows==3.2.0.post19; platform_system == "Windows"
|
||||||
tqdm
|
tqdm
|
||||||
wandb
|
wandb
|
||||||
|
|
||||||
|
# Gradio
|
||||||
|
gradio==4.37.*
|
||||||
|
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
|
||||||
|
|
||||||
# API
|
# API
|
||||||
flask_cloudflared==0.0.14
|
flask_cloudflared==0.0.14
|
||||||
sse-starlette==1.6.5
|
sse-starlette==1.6.5
|
||||||
tiktoken
|
tiktoken
|
||||||
|
|
||||||
# llama.cpp (CPU only, AVX2)
|
# llama.cpp (CPU only, AVX2)
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||||
|
|
|
||||||
|
|
@@ -4,7 +4,6 @@ colorama
 datasets
 einops
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
@@ -13,7 +12,7 @@ pandas
 peft==0.17.*
 Pillow>=9.5.0
 psutil
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -28,11 +27,15 @@ triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

@@ -6,7 +6,6 @@ datasets
 einops
 fastapi==0.112.4
 flash-linear-attention==0.3.2
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
@@ -15,7 +14,7 @@ pandas
 peft==0.17.*
 Pillow>=9.5.0
 psutil
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -30,16 +29,20 @@ triton-windows==3.3.1.post19; platform_system == "Windows"
 tqdm
 wandb
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.10/exllamav3-0.0.10+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.10/exllamav3-0.0.10+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

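After reinstalling from an updated requirements file, one way to confirm that the bumped pins (llama_cpp_binaries 0.53.0, exllamav3 0.0.10, pydantic 2.11.0) actually took effect is to query the installed distribution versions. A small sketch using the standard library's `importlib.metadata`; the distribution names below are taken from the wheel filenames and are assumed to match the installed metadata:

```python
from importlib.metadata import version, PackageNotFoundError

# Distributions whose pins changed in this commit (names assumed from the wheel filenames).
for dist in ("llama_cpp_binaries", "exllamav3", "pydantic"):
    try:
        # Expect roughly 0.53.0, 0.0.10, and 2.11.0 respectively after the upgrade.
        print(dist, version(dist))
    except PackageNotFoundError:
        print(dist, "not installed")
```
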
@@ -4,7 +4,6 @@ colorama
 datasets
 einops
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
@@ -13,7 +12,7 @@ pandas
 peft==0.17.*
 Pillow>=9.5.0
 psutil
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -28,6 +27,10 @@ triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5

@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,11 +12,15 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,12 +12,16 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,12 +12,16 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

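The `platform_release` bounds in these Mac wheel lines compare against the Darwin kernel version rather than the marketing macOS version (Darwin 22.x is macOS 13 Ventura, 23.x is macOS 14 Sonoma, 24.x is macOS 15 Sequoia). A quick sketch for checking which range a given Mac falls into:

```python
import platform

# On macOS, platform.release() returns the Darwin kernel version (e.g. "23.5.0"),
# which is what the platform_release markers above compare against.
# platform.mac_ver()[0] returns the marketing version (e.g. "14.5").
print("Darwin kernel:", platform.release())
print("macOS version:", platform.mac_ver()[0])
```
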
@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,11 +12,15 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,11 +12,15 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,11 +12,15 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,6 +12,10 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5

@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,11 +12,15 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

@@ -1,11 +1,10 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
-gradio==4.37.*
 html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*
-pydantic==2.8.2
+pydantic==2.11.0
 PyPDF2==3.0.1
 python-docx==1.1.2
 pyyaml
@@ -13,11 +12,15 @@ requests
 rich
 tqdm
 
+# Gradio
+gradio==4.37.*
+https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
+
 # API
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.49.0/llama_cpp_binaries-0.49.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.53.0/llama_cpp_binaries-0.53.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"