Compare commits


15 commits

Author | SHA1 | Message | Date
SB Yoon | ac42d48774 | Merge 3f1f0f0f7f into bd9f2de73a | 2025-12-02 14:06:14 -06:00
oobabooga | bd9f2de73a | Merge pull request #7331 from oobabooga/dev (Merge dev branch) | 2025-11-28 23:00:01 -03:00
aidevtime | 661e42d2b7 | fix(deps): upgrade coqui-tts to >=0.27.0 for transformers 4.55 compatibility (#7329) | 2025-11-28 22:59:36 -03:00
oobabooga | 5327bc9397 | Update modules/shared.py (Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>) | 2025-11-28 22:48:05 -03:00
oobabooga | 78b315344a | Update exllamav3 | 2025-11-28 06:45:05 -08:00
oobabooga | 3cad0cd4c1 | Update llama.cpp | 2025-11-28 03:52:37 -08:00
GodEmperor785 | 400bb0694b | Add slider for --ubatch-size for llama.cpp loader, change defaults for better MoE performance (#7316) | 2025-11-21 16:56:02 -03:00
oobabooga | 8f0048663d | More modular HTML generator | 2025-11-21 07:09:16 -08:00
oobabooga | b0baf7518b | Remove macos x86-64 portable builds (macos-13 runner deprecated by GitHub) | 2025-11-19 06:07:15 -08:00
SB Yoon | 3f1f0f0f7f | Merge branch 'main' into main | 2025-08-19 23:17:09 -06:00
SB Yoon | 297fd7a67a | Fix the return value in MLX loader and add named constants for magic numbers | 2025-08-08 23:20:23 -06:00
SB Yoon | fe0bef40d2 | Merge branch 'oobabooga:main' into main | 2025-08-08 23:00:15 -06:00
SB Yoon | 10947b3e53 | Merge branch 'oobabooga:main' into main | 2025-07-21 17:39:26 -06:00
SB Yoon | 25c8f1fda3 | Fix model usage issue | 2025-07-17 23:52:17 -06:00
SB Yoon | 365a997a7f | Add MLX support | 2025-07-17 18:49:13 -06:00
30 changed files with 538 additions and 86 deletions

View file

@ -57,7 +57,7 @@ jobs:
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2022', 'macos-13', 'macos-14')
'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
'pyver' = @("3.11")
'avx' = @("AVX2")
}

View file

@ -1 +1 @@
coqui-tts==0.25.1
coqui-tts>=0.27.0
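
A quick way to confirm that an existing environment already satisfies the new floor is sketched below; the check is illustrative and not part of the repo, and it assumes the third-party packaging library is installed.

# Illustrative check that the installed coqui-tts meets the new ">=0.27.0"
# floor needed for transformers 4.55 compatibility (not part of the repo).
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version  # assumes "packaging" is installed

try:
    installed = Version(version("coqui-tts"))
    print("coqui-tts", installed, "OK" if installed >= Version("0.27.0") else "too old")
except PackageNotFoundError:
    print("coqui-tts is not installed")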

View file

@ -196,50 +196,45 @@ def extract_thinking_block(string):
return None, string
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string, message_id=None):
if not string:
def build_thinking_block(thinking_content, message_id, has_remaining_content):
"""Build HTML for a thinking block."""
if thinking_content is None:
return None
# Process the thinking content through markdown
thinking_html = process_markdown_content(thinking_content)
# Generate unique ID for the thinking block
block_id = f"thinking-{message_id}-0"
# Check if thinking is complete or still in progress
is_streaming = not has_remaining_content
title_text = "Thinking..." if is_streaming else "Thought"
return f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
<summary class="thinking-header">
{info_svg_small}
<span class="thinking-title">{title_text}</span>
</summary>
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
</details>
'''
def build_main_content_block(content):
"""Build HTML for the main content block."""
if not content:
return ""
# Use a default message ID if none provided
if message_id is None:
message_id = "unknown"
# Extract thinking block if present
thinking_content, remaining_content = extract_thinking_block(string)
# Process the main content
html_output = process_markdown_content(remaining_content)
# If thinking content was found, process it using the same function
if thinking_content is not None:
thinking_html = process_markdown_content(thinking_content)
# Generate unique ID for the thinking block
block_id = f"thinking-{message_id}-0"
# Check if thinking is complete or still in progress
is_streaming = not remaining_content
title_text = "Thinking..." if is_streaming else "Thought"
thinking_block = f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
<summary class="thinking-header">
{info_svg_small}
<span class="thinking-title">{title_text}</span>
</summary>
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
</details>
'''
# Prepend the thinking block to the message HTML
html_output = thinking_block + html_output
return html_output
return process_markdown_content(content)
def process_markdown_content(string):
"""Process a string through the markdown conversion pipeline."""
"""
Process a string through the markdown conversion pipeline.
Uses robust manual parsing to ensure correct LaTeX and Code Block rendering.
"""
if not string:
return ""
@ -280,7 +275,7 @@ def process_markdown_content(string):
pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
string = pattern.sub(replace_blockquote, string)
# Code
# Code block standardization
string = string.replace('\\begin{code}', '```')
string = string.replace('\\end{code}', '```')
string = string.replace('\\begin{align*}', '$$')
@ -301,6 +296,7 @@ def process_markdown_content(string):
is_code = False
is_latex = False
# Manual line iteration for robust structure parsing
for line in string.split('\n'):
stripped_line = line.strip()
@ -371,6 +367,39 @@ def process_markdown_content(string):
return html_output
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string, message_id=None):
"""
Convert a string to markdown HTML with support for multiple block types.
Blocks are assembled in order: thinking, main content, etc.
"""
if not string:
return ""
# Use a default message ID if none provided
if message_id is None:
message_id = "unknown"
# Extract different components from the string
thinking_content, remaining_content = extract_thinking_block(string)
# Build individual HTML blocks
blocks = []
# Add thinking block if present
thinking_html = build_thinking_block(thinking_content, message_id, bool(remaining_content))
if thinking_html:
blocks.append(thinking_html)
# Add main content block
main_html = build_main_content_block(remaining_content)
if main_html:
blocks.append(main_html)
# Assemble all blocks into final HTML
return ''.join(blocks)
def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
'''
Used to avoid caching convert_to_markdown calls during streaming.
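
For context on the refactor above, the sketch below reproduces the block-assembly pattern on its own: each builder returns an optional piece of HTML, and the top-level function joins whatever is present, in order. The stand-in markdown processor and the assemble() name are illustrative, not the webui's own code.

import html

# Standalone illustration of the block-assembly pattern used by the refactored
# convert_to_markdown; a trivial stand-in replaces process_markdown_content so
# the example runs by itself.
def fake_process_markdown(text):
    return f"<p>{html.escape(text)}</p>" if text else ""

def build_thinking_block(thinking, message_id, has_remaining):
    if thinking is None:
        return None
    streaming = "false" if has_remaining else "true"
    return (f'<details class="thinking-block" data-block-id="thinking-{message_id}-0" '
            f'data-streaming="{streaming}">{fake_process_markdown(thinking)}</details>')

def build_main_content_block(content):
    return fake_process_markdown(content)

def assemble(thinking, remaining, message_id="unknown"):
    blocks = []
    thinking_html = build_thinking_block(thinking, message_id, bool(remaining))
    if thinking_html:
        blocks.append(thinking_html)
    main_html = build_main_content_block(remaining)
    if main_html:
        blocks.append(main_html)
    return ''.join(blocks)

print(assemble("planning the answer", "Here it is."))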

View file

@ -317,6 +317,7 @@ class LlamaServer:
"--ctx-size", str(shared.args.ctx_size),
"--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--ubatch-size", str(shared.args.ubatch_size),
"--port", str(self.port),
"--no-webui",
"--flash-attn", "on",

View file

@ -10,6 +10,7 @@ loaders_and_params = OrderedDict({
'threads',
'threads_batch',
'batch_size',
'ubatch_size',
'ctx_size',
'cache_type',
'tensor_split',
@ -104,6 +105,9 @@ loaders_and_params = OrderedDict({
'ctx_size',
'cpp_runner',
'tensorrt_llm_info',
],
'MLX': [
'ctx_size',
]
})
@ -358,6 +362,26 @@ loaders_samplers = {
'presence_penalty',
'auto_max_new_tokens',
'ban_eos_token',
},
'MLX': {
'temperature',
'dynatemp_low',
'dynatemp_high',
'dynatemp_exponent',
'top_p',
'top_k',
'min_p',
'xtc_threshold',
'xtc_probability',
'repetition_penalty',
'repetition_penalty_range',
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'skip_special_tokens',
'dynamic_temperature',
'seed',
'sampler_priority',
}
}
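
The new MLX entry above is a whitelist of the sampler parameters the loader accepts. As a hedged illustration of how such a whitelist can be applied (the helper below is not webui code, and the set is abridged), a full parameter state can simply be filtered against it:

# Sketch of applying a per-loader sampler whitelist to a parameter state.
# MLX_SAMPLERS is abridged from the entry above; filter_state_for_loader is
# illustrative and not part of the webui.
MLX_SAMPLERS = {
    'temperature', 'top_p', 'top_k', 'min_p', 'repetition_penalty',
    'repetition_penalty_range', 'seed',
}

def filter_state_for_loader(state, allowed):
    """Keep only the generation parameters the active loader understands."""
    return {k: v for k, v in state.items() if k in allowed}

state = {'temperature': 0.7, 'top_p': 0.9, 'mirostat_mode': 2, 'seed': 42}
print(filter_state_for_loader(state, MLX_SAMPLERS))
# {'temperature': 0.7, 'top_p': 0.9, 'seed': 42}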

modules/mlx_loader.py (new file, 371 lines added)
View file

@ -0,0 +1,371 @@
import platform
import traceback
from pathlib import Path
import modules.shared as shared
from modules.logging_colors import logger
# Constants for MLX configuration
MLX_TOP_P_DISABLED = 0.0 # MLX expects 0.0 to disable top_p
DEFAULT_MAX_TOKENS = 512 # Default maximum tokens for generation
def is_apple_silicon():
"""Check if running on Apple Silicon"""
return platform.system() == "Darwin" and platform.machine() == "arm64"
class MLXModel:
def __init__(self):
self.model = None
self.tokenizer = None
self.model_name = None
@classmethod
def from_pretrained(cls, model_name):
"""Load MLX model from path or HuggingFace repository"""
if not is_apple_silicon():
logger.warning("MLX backend is only supported on Apple Silicon. Falling back to Transformers.")
return None
try:
from mlx_lm import load
except ImportError:
logger.error("mlx-lm not found. Please install with: pip install mlx-lm")
return None
instance = cls()
instance.model_name = model_name
try:
# Determine the model path/name
model_path = cls._resolve_model_path(model_name)
logger.info(f"Loading MLX model: {model_path}")
tokenizer_config = {"trust_remote_code": True}
model, tokenizer = load(model_path, tokenizer_config=tokenizer_config)
instance.model = model
instance.tokenizer = tokenizer
logger.info(f"Successfully loaded MLX model: {model_name}")
return instance # Return instance for compatibility
except Exception as e:
error_msg = str(e)
if "not supported" in error_msg.lower():
logger.error(f"MLX model {model_name} uses an unsupported model type: {error_msg}")
logger.error("Consider using a different loader or updating mlx-lm to a newer version")
else:
logger.error(f"Failed to load MLX model {model_name}: {error_msg}")
traceback.print_exc()
return None
@staticmethod
def _resolve_model_path(model_name):
"""Resolve model path - either local path or HuggingFace repo"""
model_path = Path(f'{shared.args.model_dir}/{model_name}')
if model_path.exists():
# Local model path
return str(model_path)
elif '/' in model_name:
# Already has repo/model format
return model_name
elif '_' in model_name and not model_name.startswith('_'):
# Handle repo_name format - convert first underscore to slash
# e.g., "mlx-community_model-name" -> "mlx-community/model-name"
parts = model_name.split('_', 1)
if len(parts) == 2:
return f"{parts[0]}/{parts[1]}"
return model_name
else:
# Default to mlx-community for standalone model names
return f"mlx-community/{model_name}"
def _create_mlx_sampler(self, state):
"""Create MLX sampler with webui parameters"""
try:
from mlx_lm.sample_utils import make_sampler
# Extract sampling parameters from state
temperature = state.get('temperature', 1.0)
top_p = state.get('top_p', 1.0)
top_k = state.get('top_k', 0) # 0 means no top_k limit
min_p = state.get('min_p', 0.0)
# Handle dynamic temperature
if state.get('dynamic_temperature', False):
temp_low = state.get('dynatemp_low', 1.0)
temp_high = state.get('dynatemp_high', 1.0)
temperature = (temp_low + temp_high) / 2 # Simple average for now
# XTC sampling parameters
xtc_probability = state.get('xtc_probability', 0.0)
xtc_threshold = state.get('xtc_threshold', 0.1)
# Ensure temperature is not zero (causes issues with MLX)
if temperature <= 0.0:
temperature = 0.01
# Log sampling parameters for debugging
if shared.args.verbose:
logger.info(f"MLX Sampler - temp: {temperature}, top_p: {top_p}, top_k: {top_k}, min_p: {min_p}")
# Create the sampler
sampler = make_sampler(
temp=temperature,
top_p=top_p if top_p < 1.0 else MLX_TOP_P_DISABLED, # MLX expects 0.0 to disable
top_k=int(top_k) if top_k > 0 else 0,
min_p=min_p,
min_tokens_to_keep=1, # Always keep at least one token
xtc_probability=xtc_probability,
xtc_threshold=xtc_threshold,
xtc_special_tokens=[] # Could be customized later
)
return sampler
except ImportError:
logger.warning("MLX sampling utilities not available, using default sampler")
return None
except Exception as e:
logger.error(f"Failed to create MLX sampler: {e}")
return None
def _create_logits_processors(self, state):
"""Create logits processors for repetition penalty, etc."""
processors = []
try:
from mlx_lm.sample_utils import make_repetition_penalty
# Repetition penalty
repetition_penalty = state.get('repetition_penalty', 1.0)
if repetition_penalty != 1.0:
context_size = state.get('repetition_penalty_range', 1024)
rep_processor = make_repetition_penalty(
penalty=repetition_penalty,
context_size=context_size
)
processors.append(rep_processor)
except ImportError:
logger.warning("MLX repetition penalty not available")
except Exception as e:
logger.error(f"Failed to create repetition penalty processor: {e}")
return processors if processors else None
def _map_parameters(self, state):
"""Map text-generation-webui parameters to MLX parameters"""
mlx_params = {}
# Max tokens
if 'max_new_tokens' in state and state['max_new_tokens'] > 0:
mlx_params['max_tokens'] = state['max_new_tokens']
else:
mlx_params['max_tokens'] = DEFAULT_MAX_TOKENS # Default
# Create custom sampler with advanced parameters
sampler = self._create_mlx_sampler(state)
if sampler:
mlx_params['sampler'] = sampler
# Create logits processors
logits_processors = self._create_logits_processors(state)
if logits_processors:
mlx_params['logits_processors'] = logits_processors
# Seed handling
seed = state.get('seed', -1)
if seed != -1:
try:
import mlx.core as mx
mx.random.seed(seed)
except Exception as e:
logger.warning(f"Failed to set MLX random seed: {e}")
return mlx_params
def _prepare_prompt(self, prompt):
"""Prepare prompt with chat template if available"""
if self.tokenizer.chat_template is not None:
messages = [{"role": "user", "content": prompt}]
formatted_prompt = self.tokenizer.apply_chat_template(
messages, add_generation_prompt=True, tokenize=False
)
return formatted_prompt
return prompt
def generate(self, prompt, state):
"""Non-streaming generation with advanced sampling"""
try:
from mlx_lm.generate import generate_step
import mlx.core as mx
except ImportError:
logger.error("mlx-lm not found. Please install with: pip install mlx-lm")
return ""
if self.model is None or self.tokenizer is None:
logger.error("MLX model not loaded")
return ""
try:
# Prepare the prompt
formatted_prompt = self._prepare_prompt(prompt)
# Tokenize the prompt
prompt_tokens = self.tokenizer.encode(formatted_prompt)
prompt_array = mx.array(prompt_tokens)
# Map parameters for MLX
mlx_params = self._map_parameters(state)
# Remove max_tokens from params for generate_step
max_tokens = mlx_params.pop('max_tokens', 512)
# Generate all tokens at once
generated_tokens = []
for token, logprobs in generate_step(
prompt_array,
self.model,
max_tokens=max_tokens,
**mlx_params
):
# Handle both MLX arrays and direct integers
if hasattr(token, 'item'):
token_id = int(token.item())
else:
token_id = int(token)
generated_tokens.append(token_id)
# Check for stop conditions
if shared.stop_everything:
break
# Decode all generated tokens
if generated_tokens:
response = self.tokenizer.decode(generated_tokens)
return response
else:
return ""
except Exception as e:
logger.error(f"MLX generation failed: {str(e)}")
traceback.print_exc()
return ""
def generate_with_streaming(self, prompt, state):
"""True streaming generation using MLX generate_step"""
try:
from mlx_lm.generate import generate_step
import mlx.core as mx
except ImportError:
logger.error("mlx-lm not found. Please install with: pip install mlx-lm")
yield ""
return
if self.model is None or self.tokenizer is None:
logger.error("MLX model not loaded")
yield ""
return
try:
# Prepare the prompt
formatted_prompt = self._prepare_prompt(prompt)
# Tokenize the prompt
prompt_tokens = self.tokenizer.encode(formatted_prompt)
prompt_array = mx.array(prompt_tokens)
# Map parameters for MLX
mlx_params = self._map_parameters(state)
# Remove max_tokens from params for generate_step (use different name)
max_tokens = mlx_params.pop('max_tokens', 512)
# Initialize streaming generation
generated_tokens = []
generated_text = ""
# Use generate_step for true streaming
for token, logprobs in generate_step(
prompt_array,
self.model,
max_tokens=max_tokens,
**mlx_params
):
# Handle both MLX arrays and direct integers
if hasattr(token, 'item'):
token_id = int(token.item())
else:
token_id = int(token)
generated_tokens.append(token_id)
# Decode the new token
try:
# Decode just the new token
new_text = self.tokenizer.decode([token_id])
generated_text += new_text
# Yield the accumulated text so far
yield generated_text
except Exception as decode_error:
logger.warning(f"Failed to decode token {token_id}: {decode_error}")
continue
# Check for stop conditions
if shared.stop_everything:
break
# Final yield with complete text
if generated_text:
yield generated_text
except Exception as e:
logger.error(f"MLX streaming generation failed: {str(e)}")
traceback.print_exc()
yield ""
def encode(self, text, add_bos_token=False, **kwargs):
"""Encode text to tokens"""
if self.tokenizer is None:
import torch
return torch.tensor([[]], dtype=torch.long)
try:
# MLX tokenizer encode method
tokens = self.tokenizer.encode(text)
# Convert to tensor format expected by webui
import torch
tokens_tensor = torch.tensor([tokens], dtype=torch.long)
return tokens_tensor
except Exception as e:
logger.error(f"MLX tokenization failed: {str(e)}")
# Return empty tensor on failure
import torch
return torch.tensor([[]], dtype=torch.long)
def decode(self, token_ids, **kwargs):
"""Decode tokens to text"""
if self.tokenizer is None:
return ""
try:
# MLX tokenizer decode method
text = self.tokenizer.decode(token_ids)
return text
except Exception as e:
logger.error(f"MLX detokenization failed: {str(e)}")
return ""
def unload(self):
"""Unload the model to free memory"""
self.model = None
self.tokenizer = None
logger.info("MLX model unloaded")

View file

@ -23,6 +23,7 @@ def load_model(model_name, loader=None):
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'TensorRT-LLM': TensorRT_LLM_loader,
'MLX': MLX_loader,
}
metadata = get_model_metadata(model_name)
@ -53,7 +54,7 @@ def load_model(model_name, loader=None):
return None, None
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp' or loader == 'MLX':
shared.settings['truncation_length'] = shared.args.ctx_size
shared.is_multimodal = False
@ -131,6 +132,19 @@ def TensorRT_LLM_loader(model_name):
return model
def MLX_loader(model_name):
try:
from modules.mlx_loader import MLXModel
except ModuleNotFoundError:
raise ModuleNotFoundError("Failed to import MLX loader. Please install mlx-lm: pip install mlx-lm")
result = MLXModel.from_pretrained(model_name)
if result is None:
raise RuntimeError(f"Failed to load MLX model: {model_name}. Check the logs above for specific error details.")
return result
def unload_model(keep_model_name=False):
if shared.model is None:
return
@ -142,6 +156,8 @@ def unload_model(keep_model_name=False):
shared.model.unload()
elif model_class_name in ['Exllamav2Model', 'Exllamav2HF'] and hasattr(shared.model, 'unload'):
shared.model.unload()
elif shared.model.__class__.__name__ == 'MLXModel':
shared.model.unload()
shared.model = shared.tokenizer = None
shared.lora_names = []

View file

@ -208,6 +208,12 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
loader = 'llama.cpp'
elif re.match(r'.*\.gguf', model_name.lower()):
loader = 'llama.cpp'
elif hf_quant_method == 'mlx':
loader = 'MLX'
elif re.match(r'.*\.mlx', model_name.lower()):
loader = 'MLX'
elif model_name.lower().startswith('mlx-community'):
loader = 'MLX'
elif hf_quant_method == 'exl3':
loader = 'ExLlamav3'
elif hf_quant_method in ['exl2', 'gptq']:
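
The three new branches above route MLX-format models to the new loader. Below is a standalone sketch of just that routing logic; only the MLX checks are reproduced, and the helper name and model names are illustrative placeholders.

import re

# Reproduces only the MLX-related branches of infer_loader for illustration.
def infer_mlx(model_name, hf_quant_method=None):
    if hf_quant_method == 'mlx':
        return True
    if re.match(r'.*\.mlx', model_name.lower()):
        return True
    if model_name.lower().startswith('mlx-community'):
        return True
    return False

print(infer_mlx('mlx-community_Qwen2.5-7B-Instruct-4bit'))  # True (prefix match)
print(infer_mlx('my-model.mlx'))                            # True (extension match)
print(infer_mlx('llama-3-8b', hf_quant_method='mlx'))       # True (quant metadata)
print(infer_mlx('model.gguf'))                              # False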

View file

@ -74,7 +74,8 @@ group.add_argument('--row-split', action='store_true', help='Split the model by
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size.')
group.add_argument('--ubatch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')

View file

@ -40,7 +40,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel']:
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel', 'MLXModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
@ -148,7 +148,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav3Model', 'TensorRTLLMModel', 'MLXModel'] or shared.args.cpu:
return input_ids
else:
device = get_device()

View file

@ -129,6 +129,7 @@ def list_model_elements():
'threads',
'threads_batch',
'batch_size',
'ubatch_size',
'ctx_size',
'cache_type',
'tensor_split',

View file

@ -84,6 +84,7 @@ def create_ui():
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)

View file

@ -40,10 +40,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@ -38,7 +38,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -38,7 +38,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -38,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@ -8,6 +8,7 @@ html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
mlx-lm>=0.26.3
numpy==2.2.*
pandas
peft==0.18.*
@ -38,5 +39,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@ -38,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -38,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -40,10 +40,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@ -4,6 +4,7 @@ html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
mlx-lm>=0.26.3
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
@ -23,5 +24,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"