Mirror of https://github.com/oobabooga/text-generation-webui.git
Synced 2025-12-06 07:12:10 +01:00

Compare commits
15 commits: e42f656187 ... ac42d48774

Commits (SHA1):
- ac42d48774
- bd9f2de73a
- 661e42d2b7
- 5327bc9397
- 78b315344a
- 3cad0cd4c1
- 400bb0694b
- 8f0048663d
- b0baf7518b
- 3f1f0f0f7f
- 297fd7a67a
- fe0bef40d2
- 10947b3e53
- 25c8f1fda3
- 365a997a7f
.github/workflows/build-portable-release.yml (vendored, 2 changed lines)

@@ -57,7 +57,7 @@ jobs:
       id: set-matrix
       run: |
         $matrix = @{
-            'os' = @('ubuntu-22.04', 'windows-2022', 'macos-13', 'macos-14')
+            'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
             'pyver' = @("3.11")
             'avx' = @("AVX2")
         }
@@ -1 +1 @@
-coqui-tts==0.25.1
+coqui-tts>=0.27.0
@@ -196,50 +196,45 @@ def extract_thinking_block(string):
     return None, string


-@functools.lru_cache(maxsize=None)
-def convert_to_markdown(string, message_id=None):
-    if not string:
-        return ""
-
-    # Use a default message ID if none provided
-    if message_id is None:
-        message_id = "unknown"
-
-    # Extract thinking block if present
-    thinking_content, remaining_content = extract_thinking_block(string)
-
-    # Process the main content
-    html_output = process_markdown_content(remaining_content)
-
-    # If thinking content was found, process it using the same function
-    if thinking_content is not None:
-        thinking_html = process_markdown_content(thinking_content)
-
-        # Generate unique ID for the thinking block
-        block_id = f"thinking-{message_id}-0"
-
-        # Check if thinking is complete or still in progress
-        is_streaming = not remaining_content
-        title_text = "Thinking..." if is_streaming else "Thought"
-
-        thinking_block = f'''
-        <details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
-            <summary class="thinking-header">
-                {info_svg_small}
-                <span class="thinking-title">{title_text}</span>
-            </summary>
-            <div class="thinking-content pretty_scrollbar">{thinking_html}</div>
-        </details>
-        '''
-
-        # Prepend the thinking block to the message HTML
-        html_output = thinking_block + html_output
-
-    return html_output
+def build_thinking_block(thinking_content, message_id, has_remaining_content):
+    """Build HTML for a thinking block."""
+    if thinking_content is None:
+        return None
+
+    # Process the thinking content through markdown
+    thinking_html = process_markdown_content(thinking_content)
+
+    # Generate unique ID for the thinking block
+    block_id = f"thinking-{message_id}-0"
+
+    # Check if thinking is complete or still in progress
+    is_streaming = not has_remaining_content
+    title_text = "Thinking..." if is_streaming else "Thought"
+
+    return f'''
+    <details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
+        <summary class="thinking-header">
+            {info_svg_small}
+            <span class="thinking-title">{title_text}</span>
+        </summary>
+        <div class="thinking-content pretty_scrollbar">{thinking_html}</div>
+    </details>
+    '''
+
+
+def build_main_content_block(content):
+    """Build HTML for the main content block."""
+    if not content:
+        return ""
+
+    return process_markdown_content(content)


 def process_markdown_content(string):
-    """Process a string through the markdown conversion pipeline."""
+    """
+    Process a string through the markdown conversion pipeline.
+    Uses robust manual parsing to ensure correct LaTeX and Code Block rendering.
+    """
     if not string:
         return ""
@@ -280,7 +275,7 @@ def process_markdown_content(string):
     pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
     string = pattern.sub(replace_blockquote, string)

-    # Code
+    # Code block standardization
     string = string.replace('\\begin{code}', '```')
     string = string.replace('\\end{code}', '```')
     string = string.replace('\\begin{align*}', '$$')
@@ -301,6 +296,7 @@ def process_markdown_content(string):
     is_code = False
     is_latex = False

+    # Manual line iteration for robust structure parsing
     for line in string.split('\n'):
         stripped_line = line.strip()

@@ -371,6 +367,39 @@ def process_markdown_content(string):
     return html_output


+@functools.lru_cache(maxsize=None)
+def convert_to_markdown(string, message_id=None):
+    """
+    Convert a string to markdown HTML with support for multiple block types.
+    Blocks are assembled in order: thinking, main content, etc.
+    """
+    if not string:
+        return ""
+
+    # Use a default message ID if none provided
+    if message_id is None:
+        message_id = "unknown"
+
+    # Extract different components from the string
+    thinking_content, remaining_content = extract_thinking_block(string)
+
+    # Build individual HTML blocks
+    blocks = []
+
+    # Add thinking block if present
+    thinking_html = build_thinking_block(thinking_content, message_id, bool(remaining_content))
+    if thinking_html:
+        blocks.append(thinking_html)
+
+    # Add main content block
+    main_html = build_main_content_block(remaining_content)
+    if main_html:
+        blocks.append(main_html)
+
+    # Assemble all blocks into final HTML
+    return ''.join(blocks)
+
+
 def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
     '''
     Used to avoid caching convert_to_markdown calls during streaming.
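For orientation, a minimal usage sketch of the refactored helpers above. Illustrative only: it assumes the helpers live in modules/html_generator (this view does not name the file), and the message content and message ID are made up.

```python
# Hypothetical sketch: composing the block builders shown in the diff above.
from modules.html_generator import build_thinking_block, build_main_content_block

thinking_html = build_thinking_block(
    "First check the units, then compute.",  # hypothetical reasoning text
    message_id="msg-0",
    has_remaining_content=True,  # an answer follows, so the block is titled "Thought"
)
main_html = build_main_content_block("The answer is **42**.")

# The refactored convert_to_markdown() performs this same assembly internally:
page_html = ''.join(block for block in (thinking_html, main_html) if block)
```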
@@ -317,6 +317,7 @@ class LlamaServer:
             "--ctx-size", str(shared.args.ctx_size),
             "--gpu-layers", str(shared.args.gpu_layers),
             "--batch-size", str(shared.args.batch_size),
+            "--ubatch-size", str(shared.args.ubatch_size),
             "--port", str(self.port),
             "--no-webui",
             "--flash-attn", "on",
@@ -10,6 +10,7 @@ loaders_and_params = OrderedDict({
         'threads',
         'threads_batch',
         'batch_size',
+        'ubatch_size',
         'ctx_size',
         'cache_type',
         'tensor_split',
@@ -104,6 +105,9 @@ loaders_and_params = OrderedDict({
         'ctx_size',
         'cpp_runner',
         'tensorrt_llm_info',
     ],
+    'MLX': [
+        'ctx_size',
+    ]
 })

@@ -358,6 +362,26 @@ loaders_samplers = {
         'presence_penalty',
         'auto_max_new_tokens',
         'ban_eos_token',
     },
+    'MLX': {
+        'temperature',
+        'dynatemp_low',
+        'dynatemp_high',
+        'dynatemp_exponent',
+        'top_p',
+        'top_k',
+        'min_p',
+        'xtc_threshold',
+        'xtc_probability',
+        'repetition_penalty',
+        'repetition_penalty_range',
+        'auto_max_new_tokens',
+        'ban_eos_token',
+        'add_bos_token',
+        'skip_special_tokens',
+        'dynamic_temperature',
+        'seed',
+        'sampler_priority',
+    }
 }

modules/mlx_loader.py (new file, 371 lines)

@@ -0,0 +1,371 @@
import platform
import traceback
from pathlib import Path

import modules.shared as shared
from modules.logging_colors import logger

# Constants for MLX configuration
MLX_TOP_P_DISABLED = 0.0  # MLX expects 0.0 to disable top_p
DEFAULT_MAX_TOKENS = 512  # Default maximum tokens for generation


def is_apple_silicon():
    """Check if running on Apple Silicon"""
    return platform.system() == "Darwin" and platform.machine() == "arm64"

class MLXModel:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.model_name = None

    @classmethod
    def from_pretrained(cls, model_name):
        """Load MLX model from path or HuggingFace repository"""

        if not is_apple_silicon():
            logger.warning("MLX backend is only supported on Apple Silicon. Falling back to Transformers.")
            return None

        try:
            from mlx_lm import load
        except ImportError:
            logger.error("mlx-lm not found. Please install with: pip install mlx-lm")
            return None

        instance = cls()
        instance.model_name = model_name

        try:
            # Determine the model path/name
            model_path = cls._resolve_model_path(model_name)

            logger.info(f"Loading MLX model: {model_path}")
            tokenizer_config = {"trust_remote_code": True}
            model, tokenizer = load(model_path, tokenizer_config=tokenizer_config)

            instance.model = model
            instance.tokenizer = tokenizer

            logger.info(f"Successfully loaded MLX model: {model_name}")
            return instance  # Return instance for compatibility

        except Exception as e:
            error_msg = str(e)
            if "not supported" in error_msg.lower():
                logger.error(f"MLX model {model_name} uses an unsupported model type: {error_msg}")
                logger.error("Consider using a different loader or updating mlx-lm to a newer version")
            else:
                logger.error(f"Failed to load MLX model {model_name}: {error_msg}")
            traceback.print_exc()
            return None

    @staticmethod
    def _resolve_model_path(model_name):
        """Resolve model path - either local path or HuggingFace repo"""
        model_path = Path(f'{shared.args.model_dir}/{model_name}')

        if model_path.exists():
            # Local model path
            return str(model_path)
        elif '/' in model_name:
            # Already has repo/model format
            return model_name
        elif '_' in model_name and not model_name.startswith('_'):
            # Handle repo_name format - convert first underscore to slash
            # e.g., "mlx-community_model-name" -> "mlx-community/model-name"
            parts = model_name.split('_', 1)
            if len(parts) == 2:
                return f"{parts[0]}/{parts[1]}"
            return model_name
        else:
            # Default to mlx-community for standalone model names
            return f"mlx-community/{model_name}"

    def _create_mlx_sampler(self, state):
        """Create MLX sampler with webui parameters"""
        try:
            from mlx_lm.sample_utils import make_sampler

            # Extract sampling parameters from state
            temperature = state.get('temperature', 1.0)
            top_p = state.get('top_p', 1.0)
            top_k = state.get('top_k', 0)  # 0 means no top_k limit
            min_p = state.get('min_p', 0.0)

            # Handle dynamic temperature
            if state.get('dynamic_temperature', False):
                temp_low = state.get('dynatemp_low', 1.0)
                temp_high = state.get('dynatemp_high', 1.0)
                temperature = (temp_low + temp_high) / 2  # Simple average for now

            # XTC sampling parameters
            xtc_probability = state.get('xtc_probability', 0.0)
            xtc_threshold = state.get('xtc_threshold', 0.1)

            # Ensure temperature is not zero (causes issues with MLX)
            if temperature <= 0.0:
                temperature = 0.01

            # Log sampling parameters for debugging
            if shared.args.verbose:
                logger.info(f"MLX Sampler - temp: {temperature}, top_p: {top_p}, top_k: {top_k}, min_p: {min_p}")

            # Create the sampler
            sampler = make_sampler(
                temp=temperature,
                top_p=top_p if top_p < 1.0 else MLX_TOP_P_DISABLED,  # MLX expects 0.0 to disable
                top_k=int(top_k) if top_k > 0 else 0,
                min_p=min_p,
                min_tokens_to_keep=1,  # Always keep at least one token
                xtc_probability=xtc_probability,
                xtc_threshold=xtc_threshold,
                xtc_special_tokens=[]  # Could be customized later
            )

            return sampler

        except ImportError:
            logger.warning("MLX sampling utilities not available, using default sampler")
            return None
        except Exception as e:
            logger.error(f"Failed to create MLX sampler: {e}")
            return None

    def _create_logits_processors(self, state):
        """Create logits processors for repetition penalty, etc."""
        processors = []

        try:
            from mlx_lm.sample_utils import make_repetition_penalty

            # Repetition penalty
            repetition_penalty = state.get('repetition_penalty', 1.0)
            if repetition_penalty != 1.0:
                context_size = state.get('repetition_penalty_range', 1024)
                rep_processor = make_repetition_penalty(
                    penalty=repetition_penalty,
                    context_size=context_size
                )
                processors.append(rep_processor)

        except ImportError:
            logger.warning("MLX repetition penalty not available")
        except Exception as e:
            logger.error(f"Failed to create repetition penalty processor: {e}")

        return processors if processors else None

    def _map_parameters(self, state):
        """Map text-generation-webui parameters to MLX parameters"""
        mlx_params = {}

        # Max tokens
        if 'max_new_tokens' in state and state['max_new_tokens'] > 0:
            mlx_params['max_tokens'] = state['max_new_tokens']
        else:
            mlx_params['max_tokens'] = DEFAULT_MAX_TOKENS  # Default

        # Create custom sampler with advanced parameters
        sampler = self._create_mlx_sampler(state)
        if sampler:
            mlx_params['sampler'] = sampler

        # Create logits processors
        logits_processors = self._create_logits_processors(state)
        if logits_processors:
            mlx_params['logits_processors'] = logits_processors

        # Seed handling
        seed = state.get('seed', -1)
        if seed != -1:
            try:
                import mlx.core as mx
                mx.random.seed(seed)
            except Exception as e:
                logger.warning(f"Failed to set MLX random seed: {e}")

        return mlx_params

    def _prepare_prompt(self, prompt):
        """Prepare prompt with chat template if available"""
        if self.tokenizer.chat_template is not None:
            messages = [{"role": "user", "content": prompt}]
            formatted_prompt = self.tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=False
            )
            return formatted_prompt
        return prompt

    def generate(self, prompt, state):
        """Non-streaming generation with advanced sampling"""
        try:
            from mlx_lm.generate import generate_step
            import mlx.core as mx
        except ImportError:
            logger.error("mlx-lm not found. Please install with: pip install mlx-lm")
            return ""

        if self.model is None or self.tokenizer is None:
            logger.error("MLX model not loaded")
            return ""

        try:
            # Prepare the prompt
            formatted_prompt = self._prepare_prompt(prompt)

            # Tokenize the prompt
            prompt_tokens = self.tokenizer.encode(formatted_prompt)
            prompt_array = mx.array(prompt_tokens)

            # Map parameters for MLX
            mlx_params = self._map_parameters(state)

            # Remove max_tokens from params for generate_step
            max_tokens = mlx_params.pop('max_tokens', 512)

            # Generate all tokens at once
            generated_tokens = []

            for token, logprobs in generate_step(
                prompt_array,
                self.model,
                max_tokens=max_tokens,
                **mlx_params
            ):
                # Handle both MLX arrays and direct integers
                if hasattr(token, 'item'):
                    token_id = int(token.item())
                else:
                    token_id = int(token)
                generated_tokens.append(token_id)

                # Check for stop conditions
                if shared.stop_everything:
                    break

            # Decode all generated tokens
            if generated_tokens:
                response = self.tokenizer.decode(generated_tokens)
                return response
            else:
                return ""

        except Exception as e:
            logger.error(f"MLX generation failed: {str(e)}")
            traceback.print_exc()
            return ""

    def generate_with_streaming(self, prompt, state):
        """True streaming generation using MLX generate_step"""
        try:
            from mlx_lm.generate import generate_step
            import mlx.core as mx
        except ImportError:
            logger.error("mlx-lm not found. Please install with: pip install mlx-lm")
            yield ""
            return

        if self.model is None or self.tokenizer is None:
            logger.error("MLX model not loaded")
            yield ""
            return

        try:
            # Prepare the prompt
            formatted_prompt = self._prepare_prompt(prompt)

            # Tokenize the prompt
            prompt_tokens = self.tokenizer.encode(formatted_prompt)
            prompt_array = mx.array(prompt_tokens)

            # Map parameters for MLX
            mlx_params = self._map_parameters(state)

            # Remove max_tokens from params for generate_step (use different name)
            max_tokens = mlx_params.pop('max_tokens', 512)

            # Initialize streaming generation
            generated_tokens = []
            generated_text = ""

            # Use generate_step for true streaming
            for token, logprobs in generate_step(
                prompt_array,
                self.model,
                max_tokens=max_tokens,
                **mlx_params
            ):
                # Handle both MLX arrays and direct integers
                if hasattr(token, 'item'):
                    token_id = int(token.item())
                else:
                    token_id = int(token)
                generated_tokens.append(token_id)

                # Decode the new token
                try:
                    # Decode just the new token
                    new_text = self.tokenizer.decode([token_id])
                    generated_text += new_text

                    # Yield the accumulated text so far
                    yield generated_text

                except Exception as decode_error:
                    logger.warning(f"Failed to decode token {token_id}: {decode_error}")
                    continue

                # Check for stop conditions
                if shared.stop_everything:
                    break

            # Final yield with complete text
            if generated_text:
                yield generated_text

        except Exception as e:
            logger.error(f"MLX streaming generation failed: {str(e)}")
            traceback.print_exc()
            yield ""

    def encode(self, text, add_bos_token=False, **kwargs):
        """Encode text to tokens"""
        if self.tokenizer is None:
            import torch
            return torch.tensor([[]], dtype=torch.long)

        try:
            # MLX tokenizer encode method
            tokens = self.tokenizer.encode(text)

            # Convert to tensor format expected by webui
            import torch
            tokens_tensor = torch.tensor([tokens], dtype=torch.long)
            return tokens_tensor
        except Exception as e:
            logger.error(f"MLX tokenization failed: {str(e)}")
            # Return empty tensor on failure
            import torch
            return torch.tensor([[]], dtype=torch.long)

    def decode(self, token_ids, **kwargs):
        """Decode tokens to text"""
        if self.tokenizer is None:
            return ""

        try:
            # MLX tokenizer decode method
            text = self.tokenizer.decode(token_ids)
            return text
        except Exception as e:
            logger.error(f"MLX detokenization failed: {str(e)}")
            return ""

    def unload(self):
        """Unload the model to free memory"""
        self.model = None
        self.tokenizer = None
        logger.info("MLX model unloaded")

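A minimal sketch of driving the new MLXModel class directly, outside the webui loop. Illustrative only: it assumes an Apple Silicon machine with mlx-lm installed, uses a hypothetical model name, and only relies on the class API shown in the new file above.

```python
# Hypothetical sketch: load an MLX-quantized model and stream a reply.
from modules.mlx_loader import MLXModel

model = MLXModel.from_pretrained("mlx-community/Llama-3.2-3B-Instruct-4bit")  # hypothetical repo
if model is not None:
    state = {"temperature": 0.7, "top_p": 0.9, "max_new_tokens": 128, "seed": -1}
    output = ""
    for output in model.generate_with_streaming("Explain KV caching in one paragraph.", state):
        pass  # each yield is the accumulated text so far
    print(output)
    model.unload()
```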
@@ -23,6 +23,7 @@ def load_model(model_name, loader=None):
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'ExLlamav2': ExLlamav2_loader,
         'TensorRT-LLM': TensorRT_LLM_loader,
+        'MLX': MLX_loader,
     }

     metadata = get_model_metadata(model_name)
@@ -53,7 +54,7 @@ def load_model(model_name, loader=None):
         return None, None

     shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
-    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
+    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp' or loader == 'MLX':
         shared.settings['truncation_length'] = shared.args.ctx_size

     shared.is_multimodal = False
@@ -131,6 +132,19 @@ def TensorRT_LLM_loader(model_name):
     return model


+def MLX_loader(model_name):
+    try:
+        from modules.mlx_loader import MLXModel
+    except ModuleNotFoundError:
+        raise ModuleNotFoundError("Failed to import MLX loader. Please install mlx-lm: pip install mlx-lm")
+
+    result = MLXModel.from_pretrained(model_name)
+    if result is None:
+        raise RuntimeError(f"Failed to load MLX model: {model_name}. Check the logs above for specific error details.")
+
+    return result
+
+
 def unload_model(keep_model_name=False):
     if shared.model is None:
         return
@@ -142,6 +156,8 @@ def unload_model(keep_model_name=False):
         shared.model.unload()
     elif model_class_name in ['Exllamav2Model', 'Exllamav2HF'] and hasattr(shared.model, 'unload'):
         shared.model.unload()
+    elif shared.model.__class__.__name__ == 'MLXModel':
+        shared.model.unload()

     shared.model = shared.tokenizer = None
     shared.lora_names = []
@@ -208,6 +208,12 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
         loader = 'llama.cpp'
     elif re.match(r'.*\.gguf', model_name.lower()):
         loader = 'llama.cpp'
+    elif hf_quant_method == 'mlx':
+        loader = 'MLX'
+    elif re.match(r'.*\.mlx', model_name.lower()):
+        loader = 'MLX'
+    elif model_name.lower().startswith('mlx-community'):
+        loader = 'MLX'
     elif hf_quant_method == 'exl3':
         loader = 'ExLlamav3'
     elif hf_quant_method in ['exl2', 'gptq']:
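The three new branches above route MLX-style model names and the 'mlx' quantization method to the MLX loader. A standalone paraphrase of just that logic, for illustration (the real infer_loader handles many more loaders; the folder names below are hypothetical):

```python
import re

def guess_is_mlx(model_name, hf_quant_method=None):
    """Paraphrase of the new MLX branches added to infer_loader() above."""
    name = model_name.lower()
    return (
        hf_quant_method == 'mlx'
        or re.match(r'.*\.mlx', name) is not None
        or name.startswith('mlx-community')
    )

print(guess_is_mlx("mlx-community_Llama-3.2-3B-Instruct-4bit"))  # True
print(guess_is_mlx("MyModel.mlx"))                               # True
print(guess_is_mlx("llama-3.2-3b-instruct.gguf"))                # False
```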
@@ -74,7 +74,8 @@ group.add_argument('--row-split', action='store_true', help='Split the model by
 group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
 group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
 group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
-group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
+group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size.')
+group.add_argument('--ubatch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).')
 group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
 group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
@@ -40,7 +40,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
         yield ''
         return

-    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']:
+    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel', 'MLXModel']:
         generate_func = generate_reply_custom
     else:
         generate_func = generate_reply_HF
@@ -148,7 +148,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]

-    if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel', 'MLXModel'] or shared.args.cpu:
         return input_ids
     else:
         device = get_device()
@@ -129,6 +129,7 @@ def list_model_elements():
         'threads',
         'threads_batch',
         'batch_size',
+        'ubatch_size',
         'ctx_size',
         'cache_type',
         'tensor_split',
@@ -84,6 +84,7 @@ def create_ui():
             shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
             shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
             shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
+            shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
             shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
             shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
@@ -38,7 +38,7 @@ sse-starlette==1.6.5
 tiktoken

 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
@@ -38,7 +38,7 @@ sse-starlette==1.6.5
 tiktoken

 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
@@ -38,5 +38,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
@@ -8,6 +8,7 @@ html2text==2025.4.15
 huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
+mlx-lm>=0.26.3
 numpy==2.2.*
 pandas
 peft==0.18.*
@@ -38,5 +39,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
@@ -38,5 +38,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -38,5 +38,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -40,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
@@ -4,6 +4,7 @@ html2text==2025.4.15
 huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
+mlx-lm>=0.26.3
 numpy==2.2.*
 pydantic==2.11.0
 PyPDF2==3.0.1
@@ -23,5 +24,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
|
|
@ -23,5 +23,5 @@ sse-starlette==1.6.5
|
|||
tiktoken
|
||||
|
||||
# llama.cpp (CPU only, AVX2)
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||
|
|
|
|||
|
|
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"