Compare commits

...

29 commits
v3.16 ... main

Author SHA1 Message Date
oobabooga bd9f2de73a
Merge pull request #7331 from oobabooga/dev
Merge dev branch
2025-11-28 23:00:01 -03:00
aidevtime 661e42d2b7
fix(deps): upgrade coqui-tts to >=0.27.0 for transformers 4.55 compatibility (#7329) 2025-11-28 22:59:36 -03:00
oobabooga 5327bc9397
Update modules/shared.py
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-11-28 22:48:05 -03:00
oobabooga 78b315344a Update exllamav3 2025-11-28 06:45:05 -08:00
oobabooga 3cad0cd4c1 Update llama.cpp 2025-11-28 03:52:37 -08:00
GodEmperor785 400bb0694b
Add slider for --ubatch-size for llama.cpp loader, change defaults for better MoE performance (#7316) 2025-11-21 16:56:02 -03:00
oobabooga 8f0048663d More modular HTML generator 2025-11-21 07:09:16 -08:00
oobabooga b0baf7518b Remove macos x86-64 portable builds (macos-13 runner deprecated by GitHub) 2025-11-19 06:07:15 -08:00
oobabooga 1afe0827ba
Merge pull request #7317 from oobabooga/dev
Merge dev branch
2025-11-19 11:04:02 -03:00
oobabooga 0d4eff284c Add a --cpu-moe option for llama.cpp 2025-11-19 05:23:43 -08:00
oobabooga d6f39e1fef Add ROCm portable builds 2025-11-18 16:32:20 -08:00
oobabooga 327a234d23 Add ROCm requirements.txt files 2025-11-18 16:24:56 -08:00
oobabooga 4e4abd0841 Merge remote-tracking branch 'refs/remotes/origin/dev' into dev 2025-11-18 14:07:05 -08:00
oobabooga c45f35ccc2 Remove the macos 13 wheels (deprecated by GitHub) 2025-11-18 14:06:42 -08:00
oobabooga d85b95bb15 Update llama.cpp 2025-11-18 14:06:04 -08:00
dependabot[bot] 4a36b7be5b
Bump triton-windows in /requirements/full (#7311) 2025-11-18 18:51:26 -03:00
dependabot[bot] 3d7e9856a2
Update peft requirement from ==0.17.* to ==0.18.* in /requirements/full (#7310) 2025-11-18 18:51:15 -03:00
oobabooga a26e28bdea Update exllamav3 to 0.0.15 2025-11-18 11:24:16 -08:00
oobabooga 6a3bf1de92 Update exllamav3 to 0.0.14 2025-11-09 19:43:53 -08:00
oobabooga 9ad9afad7d
Merge pull request #7296 from oobabooga/dev
Merge dev branch
2025-11-06 00:38:25 -03:00
oobabooga e7534a90d8 Update llama.cpp 2025-11-05 18:46:01 -08:00
oobabooga 6be1bfcc87 Remove the CUDA 11.7 portable builds 2025-11-05 05:45:10 -08:00
oobabooga 92d9cd36a6 Update llama.cpp 2025-11-05 05:43:34 -08:00
oobabooga 67f9288891 Pin huggingface-hub to 0.36.0 (solves #7284 and #7289) 2025-11-02 14:01:00 -08:00
oobabooga 16f77b74c4 Merge remote-tracking branch 'refs/remotes/origin/dev' into dev 2025-11-01 19:58:53 -07:00
oobabooga cd645f80f8 Update exllamav3 to 0.0.12 2025-11-01 19:58:18 -07:00
Trenten Miller 6871484398
fix: Rename 'evaluation_strategy' to 'eval_strategy' in training 2025-10-28 16:48:04 -03:00
oobabooga 338ae36f73 Add weights_only=True to torch.load in Training_PRO 2025-10-28 12:43:16 -07:00
dependabot[bot] c8cd840b24
Bump flash-linear-attention from 0.3.2 to 0.4.0 in /requirements/full (#7285)
Bumps [flash-linear-attention](https://github.com/fla-org/flash-linear-attention) from 0.3.2 to 0.4.0.
- [Release notes](https://github.com/fla-org/flash-linear-attention/releases)
- [Commits](https://github.com/fla-org/flash-linear-attention/compare/v0.3.2...v0.4.0)

---
updated-dependencies:
- dependency-name: flash-linear-attention
  dependency-version: 0.4.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-10-28 10:07:03 -03:00
33 changed files with 390 additions and 118 deletions

View file

@@ -41,6 +41,13 @@ jobs:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
build_release_rocm_linux:
name: ROCm Linux
uses: ./.github/workflows/build-portable-release-rocm.yml
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
build_release_cpu_windows:
name: CPU Windows
uses: ./.github/workflows/build-portable-release.yml

View file

@@ -60,7 +60,7 @@ jobs:
'os' = @('ubuntu-22.04', 'windows-2022')
'pyver' = @("3.11")
'avx' = @("AVX2")
'cuda' = @("11.7", "12.4")
'cuda' = @("12.4")
}
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
@@ -147,22 +147,13 @@ jobs:
# Create CUDA-specific requirements file if needed
cd "text-generation-webui-${VERSION_CLEAN}"
if [[ "$CUDA_VERSION" == "11.7" ]]; then
echo "Creating CUDA 11.7 specific requirements file"
sed 's/cu124/cu117/g' "$BASE_REQ_FILE" > requirements_cuda_temp.txt
REQ_FILE="requirements_cuda_temp.txt"
else
REQ_FILE="$BASE_REQ_FILE"
fi
REQ_FILE="$BASE_REQ_FILE"
# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
# 5. Clean up
if [[ "$CUDA_VERSION" == "11.7" ]]; then
rm requirements_cuda_temp.txt
fi
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
# 6. Create ZIP file

View file

@@ -0,0 +1,165 @@
name: Build ROCm
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
workflow_call:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
permissions:
contents: write
jobs:
define_matrix:
name: Define Build Matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
defaults:
run:
shell: pwsh
env:
CONFIGIN: ${{ inputs.config }}
EXCLUDEIN: ${{ inputs.exclude }}
steps:
- name: Define Job Output
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04')
'pyver' = @("3.11")
'avx' = @("AVX2")
}
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
if ($env:EXCLUDEIN -ne 'None') {
$exclusions = @()
$exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
$matrix['exclude'] = $exclusions
}
$matrixOut = ConvertTo-Json $matrix -Compress
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
build_wheels:
name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }}
needs: define_matrix
runs-on: ${{ matrix.os }}
strategy:
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
defaults:
run:
shell: pwsh
env:
AVXVER: ${{ matrix.avx }}
PCKGVER: ${{ inputs.version }}
steps:
- uses: actions/checkout@v4
with:
repository: 'oobabooga/text-generation-webui'
ref: ${{ inputs.version }}
submodules: 'recursive'
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: Build Package
shell: bash
run: |
VERSION_CLEAN="${{ inputs.version }}"
VERSION_CLEAN="${VERSION_CLEAN#v}"
cd ..
cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
cd "text-generation-webui-${VERSION_CLEAN}"
# Remove extensions that need additional requirements
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"
VERSION="${{ inputs.version }}"
# 1. Set platform-specific variables (Linux only for ROCm)
PLATFORM="linux"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
rm start_macos.sh start_windows.bat
# 2. Download and extract Python
cd ..
echo "Downloading Python for $PLATFORM..."
curl -L -o python-build.tar.gz "$PYTHON_URL"
tar -xzf python-build.tar.gz
mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
# 3. Prepare requirements file based on AVX
if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
BASE_REQ_FILE="requirements/portable/requirements_amd.txt"
else
BASE_REQ_FILE="requirements/portable/requirements_amd_noavx2.txt"
fi
REQ_FILE="$BASE_REQ_FILE"
cd "text-generation-webui-${VERSION_CLEAN}"
# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
# 5. Clean up
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
# 6. Create ZIP file
cd ..
ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm.zip"
echo "Creating archive: $ZIP_NAME"
zip -r "$ZIP_NAME" "text-generation-webui-${VERSION_CLEAN}"
- name: Upload files to a GitHub release
id: upload-release
uses: svenstaro/upload-release-action@2.7.0
continue-on-error: true
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ../textgen-portable-*.zip
tag: ${{ inputs.version }}
file_glob: true
make_latest: false
overwrite: true

View file

@@ -57,7 +57,7 @@ jobs:
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2022', 'macos-13', 'macos-14')
'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
'pyver' = @("3.11")
'avx' = @("AVX2")
}

View file

@@ -823,7 +823,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
lora_model = get_peft_model(shared.model, config)
if not always_override and Path(f"{lora_file_path}/adapter_model.bin").is_file():
logger.info("Loading existing LoRA data...")
state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin")
state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin", weights_only=True)
set_peft_model_state_dict(lora_model, state_dict_peft)
print(f" + Continue Training on {RED}{lora_file_path}/adapter_model.bin{RESET}")

View file

@@ -1 +1 @@
coqui-tts==0.25.1
coqui-tts>=0.27.0

View file

@@ -196,50 +196,45 @@ def extract_thinking_block(string):
return None, string
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string, message_id=None):
if not string:
def build_thinking_block(thinking_content, message_id, has_remaining_content):
"""Build HTML for a thinking block."""
if thinking_content is None:
return None
# Process the thinking content through markdown
thinking_html = process_markdown_content(thinking_content)
# Generate unique ID for the thinking block
block_id = f"thinking-{message_id}-0"
# Check if thinking is complete or still in progress
is_streaming = not has_remaining_content
title_text = "Thinking..." if is_streaming else "Thought"
return f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
<summary class="thinking-header">
{info_svg_small}
<span class="thinking-title">{title_text}</span>
</summary>
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
</details>
'''
def build_main_content_block(content):
"""Build HTML for the main content block."""
if not content:
return ""
# Use a default message ID if none provided
if message_id is None:
message_id = "unknown"
# Extract thinking block if present
thinking_content, remaining_content = extract_thinking_block(string)
# Process the main content
html_output = process_markdown_content(remaining_content)
# If thinking content was found, process it using the same function
if thinking_content is not None:
thinking_html = process_markdown_content(thinking_content)
# Generate unique ID for the thinking block
block_id = f"thinking-{message_id}-0"
# Check if thinking is complete or still in progress
is_streaming = not remaining_content
title_text = "Thinking..." if is_streaming else "Thought"
thinking_block = f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
<summary class="thinking-header">
{info_svg_small}
<span class="thinking-title">{title_text}</span>
</summary>
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
</details>
'''
# Prepend the thinking block to the message HTML
html_output = thinking_block + html_output
return html_output
return process_markdown_content(content)
def process_markdown_content(string):
"""Process a string through the markdown conversion pipeline."""
"""
Process a string through the markdown conversion pipeline.
Uses robust manual parsing to ensure correct LaTeX and Code Block rendering.
"""
if not string:
return ""
@@ -280,7 +275,7 @@ def process_markdown_content(string):
pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
string = pattern.sub(replace_blockquote, string)
# Code
# Code block standardization
string = string.replace('\\begin{code}', '```')
string = string.replace('\\end{code}', '```')
string = string.replace('\\begin{align*}', '$$')
@@ -301,6 +296,7 @@ def process_markdown_content(string):
is_code = False
is_latex = False
# Manual line iteration for robust structure parsing
for line in string.split('\n'):
stripped_line = line.strip()
@@ -371,6 +367,39 @@
return html_output
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string, message_id=None):
"""
Convert a string to markdown HTML with support for multiple block types.
Blocks are assembled in order: thinking, main content, etc.
"""
if not string:
return ""
# Use a default message ID if none provided
if message_id is None:
message_id = "unknown"
# Extract different components from the string
thinking_content, remaining_content = extract_thinking_block(string)
# Build individual HTML blocks
blocks = []
# Add thinking block if present
thinking_html = build_thinking_block(thinking_content, message_id, bool(remaining_content))
if thinking_html:
blocks.append(thinking_html)
# Add main content block
main_html = build_main_content_block(remaining_content)
if main_html:
blocks.append(main_html)
# Assemble all blocks into final HTML
return ''.join(blocks)
def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
'''
Used to avoid caching convert_to_markdown calls during streaming.
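
The net effect of the refactor: `convert_to_markdown` now composes independent block builders instead of one monolithic body. A hypothetical call, assuming the webui's modules package is importable and that `extract_thinking_block` parses `<think>` tags (both assumptions here):

```python
# Hypothetical usage; the delimiter syntax is an assumption, and the output
# shape is inferred from the template in build_thinking_block above.
from modules.html_generator import convert_to_markdown

html = convert_to_markdown("<think>outline the reply</think>Hello!", message_id="42")
# Blocks come out in order: the thinking <details> element
# (data-block-id="thinking-42-0"), then the rendered main content.
```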

View file

@@ -317,6 +317,7 @@ class LlamaServer:
"--ctx-size", str(shared.args.ctx_size),
"--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--ubatch-size", str(shared.args.ubatch_size),
"--port", str(self.port),
"--no-webui",
"--flash-attn", "on",
@@ -326,6 +327,8 @@ class LlamaServer:
cmd += ["--threads", str(shared.args.threads)]
if shared.args.threads_batch > 0:
cmd += ["--threads-batch", str(shared.args.threads_batch)]
if shared.args.cpu_moe:
cmd.append("--cpu-moe")
if shared.args.no_mmap:
cmd.append("--no-mmap")
if shared.args.mlock:
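
Putting the new options together, the launcher now forwards both batch sizes and the MoE offload flag to llama-server. A sketch of the resulting assembly (placeholder values, not defaults; `server_path` and `cpu_moe` stand in for the real binary path and `shared.args.cpu_moe`):

```python
# Sketch only: mirrors the argument assembly above with placeholder values.
server_path = "/path/to/llama-server"
cpu_moe = True

cmd = [
    server_path,
    "--batch-size", "1024",   # logical batch size (application level)
    "--ubatch-size", "1024",  # physical batch size for computation (device level)
]
if cpu_moe:
    cmd.append("--cpu-moe")   # keep MoE expert weights in system RAM to save VRAM
```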

View file

@@ -6,9 +6,11 @@ import gradio as gr
loaders_and_params = OrderedDict({
'llama.cpp': [
'gpu_layers',
'cpu_moe',
'threads',
'threads_batch',
'batch_size',
'ubatch_size',
'ctx_size',
'cache_type',
'tensor_split',

View file

@@ -66,6 +66,7 @@ group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the pr
# llama.cpp
group = parser.add_argument_group('llama.cpp')
group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
group.add_argument('--cpu-moe', action='store_true', help='Move the experts to the CPU (for MoE models).')
group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
@@ -73,7 +74,8 @@ group.add_argument('--row-split', action='store_true', help='Split the model by
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size.')
group.add_argument('--ubatch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')

View file

@@ -611,7 +611,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
bf16=shared.args.bf16,
optim=optimizer,
logging_steps=2 if stop_at_loss > 0 else 5,
evaluation_strategy="steps" if eval_data is not None else "no",
eval_strategy="steps" if eval_data is not None else "no",
eval_steps=math.ceil(eval_steps / gradient_accumulation_steps) if eval_data is not None else None,
save_strategy="steps" if eval_data is not None else "no",
output_dir=lora_file_path,
@@ -620,7 +620,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
# TODO: Enable multi-device support
ddp_find_unused_parameters=None,
no_cuda=shared.args.cpu,
use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
# use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
),
data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
callbacks=list([Callbacks()])
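
This tracks the upstream rename in transformers, where `evaluation_strategy` was deprecated in favor of `eval_strategy`. A minimal sketch with illustrative values:

```python
from transformers import TrainingArguments

# eval_strategy replaces the deprecated evaluation_strategy keyword.
args = TrainingArguments(
    output_dir="lora_out",   # illustrative
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
)
```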

View file

@@ -125,9 +125,11 @@ def list_model_elements():
'loader',
'cpu_memory',
'gpu_layers',
'cpu_moe',
'threads',
'threads_batch',
'batch_size',
'ubatch_size',
'ctx_size',
'cache_type',
'tensor_split',

View file

@@ -50,6 +50,7 @@ def create_ui():
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
@@ -83,6 +84,7 @@ def create_ui():
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
@@ -94,7 +96,7 @@ def create_ui():
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
with gr.Column():
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')

View file

@@ -5,13 +5,14 @@ colorama
datasets
einops
fastapi==0.112.4
flash-linear-attention==0.3.2
flash-linear-attention==0.4.0
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -25,7 +26,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb
@@ -39,10 +40,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
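
Each wheel URL above is gated by a PEP 508 environment marker, so one requirements file serves every platform. A small sketch of how such markers evaluate, using the `packaging` library:

```python
from packaging.markers import Marker

# pip evaluates markers like these against the running interpreter and platform.
m = Marker('platform_system == "Linux" and platform_machine == "x86_64"')
print(m.evaluate())  # True only on x86_64 Linux; the wheel is skipped elsewhere
```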

View file

@@ -5,11 +5,12 @@ datasets
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -23,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb
@@ -37,7 +38,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -5,11 +5,12 @@ datasets
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -23,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb
@@ -37,7 +38,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -5,11 +5,12 @@ datasets
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -23,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb
@@ -37,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@@ -5,11 +5,12 @@ datasets
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -23,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb
@@ -37,6 +38,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@@ -5,11 +5,12 @@ datasets
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -23,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb
@@ -37,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -5,11 +5,12 @@ datasets
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -23,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb
@@ -37,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -5,13 +5,14 @@ colorama
datasets
einops
fastapi==0.112.4
flash-linear-attention==0.3.2
flash-linear-attention==0.4.0
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -25,7 +26,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb
@@ -39,10 +40,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@@ -5,11 +5,12 @@ datasets
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.17.*
peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@@ -23,7 +24,7 @@ scipy
sentencepiece
tensorboard
transformers==4.57.*
triton-windows==3.5.0.post21; platform_system == "Windows"
triton-windows==3.5.1.post21; platform_system == "Windows"
tqdm
wandb

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich
tqdm
# Gradio
gradio==4.37.*
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich
tqdm
# Gradio
gradio==4.37.*
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -22,6 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -22,6 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -21,6 +22,6 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
# Vulkan wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"