mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2025-12-06 07:12:10 +01:00
Compare commits
29 commits
| SHA1 |
|---|
| bd9f2de73a |
| 661e42d2b7 |
| 5327bc9397 |
| 78b315344a |
| 3cad0cd4c1 |
| 400bb0694b |
| 8f0048663d |
| b0baf7518b |
| 1afe0827ba |
| 0d4eff284c |
| d6f39e1fef |
| 327a234d23 |
| 4e4abd0841 |
| c45f35ccc2 |
| d85b95bb15 |
| 4a36b7be5b |
| 3d7e9856a2 |
| a26e28bdea |
| 6a3bf1de92 |
| 9ad9afad7d |
| e7534a90d8 |
| 6be1bfcc87 |
| 92d9cd36a6 |
| 67f9288891 |
| 16f77b74c4 |
| cd645f80f8 |
| 6871484398 |
| 338ae36f73 |
| c8cd840b24 |
.github/workflows/build-everything-tgw.yml (vendored, 7 changed lines)

@@ -41,6 +41,13 @@ jobs:
       version: ${{ inputs.version }}
       config: 'os:ubuntu-22.04'
 
+  build_release_rocm_linux:
+    name: ROCm Linux
+    uses: ./.github/workflows/build-portable-release-rocm.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:ubuntu-22.04'
+
   build_release_cpu_windows:
     name: CPU Windows
     uses: ./.github/workflows/build-portable-release.yml
@@ -60,7 +60,7 @@ jobs:
           'os' = @('ubuntu-22.04', 'windows-2022')
           'pyver' = @("3.11")
           'avx' = @("AVX2")
-          'cuda' = @("11.7", "12.4")
+          'cuda' = @("12.4")
       }
 
       if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

@@ -147,22 +147,13 @@ jobs:
 
           # Create CUDA-specific requirements file if needed
           cd "text-generation-webui-${VERSION_CLEAN}"
-          if [[ "$CUDA_VERSION" == "11.7" ]]; then
-            echo "Creating CUDA 11.7 specific requirements file"
-            sed 's/cu124/cu117/g' "$BASE_REQ_FILE" > requirements_cuda_temp.txt
-            REQ_FILE="requirements_cuda_temp.txt"
-          else
-            REQ_FILE="$BASE_REQ_FILE"
-          fi
+          REQ_FILE="$BASE_REQ_FILE"
 
           # 4. Install packages
           echo "Installing Python packages from $REQ_FILE..."
           $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
 
           # 5. Clean up
-          if [[ "$CUDA_VERSION" == "11.7" ]]; then
-            rm requirements_cuda_temp.txt
-          fi
           rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
 
           # 6. Create ZIP file
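For reference, the `config` and `exclude` workflow inputs use a compact `key:item1,item2;key2:...` string format, and the PowerShell one-liner above replaces whole matrix axes with the supplied values. A rough Python sketch of that parsing (function and variable names are illustrative, not from the repository):

```python
def apply_config_override(matrix: dict, config: str) -> dict:
    """Replace whole matrix axes from a 'key1:a,b;key2:c' override string."""
    if config != "Default":
        for entry in config.split(";"):
            key, values = entry.split(":", 1)
            matrix[key] = values.split(",")
    return matrix

# Example: build-everything-tgw.yml passes config: 'os:ubuntu-22.04'
matrix = {"os": ["ubuntu-22.04", "windows-2022"], "pyver": ["3.11"], "avx": ["AVX2"], "cuda": ["12.4"]}
print(apply_config_override(matrix, "os:ubuntu-22.04"))
# {'os': ['ubuntu-22.04'], 'pyver': ['3.11'], 'avx': ['AVX2'], 'cuda': ['12.4']}
```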
.github/workflows/build-portable-release-rocm.yml (vendored, new file, 165 lines)

@@ -0,0 +1,165 @@
name: Build ROCm

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string
  workflow_call:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string

permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    env:
      CONFIGIN: ${{ inputs.config }}
      EXCLUDEIN: ${{ inputs.exclude }}

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('ubuntu-22.04')
              'pyver' = @("3.11")
              'avx' = @("AVX2")
          }

          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

          if ($env:EXCLUDEIN -ne 'None') {
              $exclusions = @()
              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
              $matrix['exclude'] = $exclusions
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      AVXVER: ${{ matrix.avx }}
      PCKGVER: ${{ inputs.version }}

    steps:
      - uses: actions/checkout@v4
        with:
          repository: 'oobabooga/text-generation-webui'
          ref: ${{ inputs.version }}
          submodules: 'recursive'

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      - name: Build Package
        shell: bash
        run: |
          VERSION_CLEAN="${{ inputs.version }}"
          VERSION_CLEAN="${VERSION_CLEAN#v}"
          cd ..
          cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
          cd "text-generation-webui-${VERSION_CLEAN}"

          # Remove extensions that need additional requirements
          allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
          find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf

          # Define common variables
          AVX_SUPPORT="${{ matrix.avx }}"
          VERSION="${{ inputs.version }}"

          # 1. Set platform-specific variables (Linux only for ROCm)
          PLATFORM="linux"
          PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
          PIP_PATH="portable_env/bin/python -m pip"
          PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
          rm start_macos.sh start_windows.bat

          # 2. Download and extract Python
          cd ..
          echo "Downloading Python for $PLATFORM..."
          curl -L -o python-build.tar.gz "$PYTHON_URL"
          tar -xzf python-build.tar.gz
          mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"

          # 3. Prepare requirements file based on AVX
          if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
            BASE_REQ_FILE="requirements/portable/requirements_amd.txt"
          else
            BASE_REQ_FILE="requirements/portable/requirements_amd_noavx2.txt"
          fi
          REQ_FILE="$BASE_REQ_FILE"

          cd "text-generation-webui-${VERSION_CLEAN}"

          # 4. Install packages
          echo "Installing Python packages from $REQ_FILE..."
          $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"

          # 5. Clean up
          rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py

          # 6. Create ZIP file
          cd ..
          ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm.zip"
          echo "Creating archive: $ZIP_NAME"

          zip -r "$ZIP_NAME" "text-generation-webui-${VERSION_CLEAN}"

      - name: Upload files to a GitHub release
        id: upload-release
        uses: svenstaro/upload-release-action@2.7.0
        continue-on-error: true
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: ../textgen-portable-*.zip
          tag: ${{ inputs.version }}
          file_glob: true
          make_latest: false
          overwrite: true
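The `exclude` handling in define_matrix is terse because it leans on PowerShell's ConvertFrom-StringData. As a rough illustration only (not code from the repository), the same transformation in Python, turning the input string into GitHub Actions matrix exclusions:

```python
def parse_exclusions(exclude: str) -> list:
    """'os:windows-2022,avx:AVX2;pyver:3.11,avx:AVX' -> list of exclusion dicts."""
    if exclude == "None":
        return []
    exclusions = []
    for group in exclude.split(";"):
        # Each ';'-separated group becomes one exclusion entry.
        pairs = (item.split(":", 1) for item in group.split(","))
        exclusions.append({key: value for key, value in pairs})
    return exclusions

matrix = {"os": ["ubuntu-22.04"], "pyver": ["3.11"], "avx": ["AVX2"]}
matrix["exclude"] = parse_exclusions("avx:AVX2")
# [{'avx': 'AVX2'}]
```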
.github/workflows/build-portable-release.yml (vendored, 2 changed lines)

@@ -57,7 +57,7 @@ jobs:
        id: set-matrix
        run: |
          $matrix = @{
-              'os' = @('ubuntu-22.04', 'windows-2022', 'macos-13', 'macos-14')
+              'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
              'pyver' = @("3.11")
              'avx' = @("AVX2")
          }
@@ -823,7 +823,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
     lora_model = get_peft_model(shared.model, config)
     if not always_override and Path(f"{lora_file_path}/adapter_model.bin").is_file():
         logger.info("Loading existing LoRA data...")
-        state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin")
+        state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin", weights_only=True)
         set_peft_model_state_dict(lora_model, state_dict_peft)

         print(f" + Continue Training on {RED}{lora_file_path}/adapter_model.bin{RESET}")
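Passing weights_only=True makes torch.load refuse to unpickle arbitrary Python objects, so a tampered adapter checkpoint cannot execute code at load time. A minimal sketch of the pattern (the path and the pre-existing lora_model are illustrative, not from the repository):

```python
import torch
from peft import set_peft_model_state_dict

adapter_path = "loras/my-lora/adapter_model.bin"  # illustrative path

# weights_only=True restricts unpickling to plain tensors and containers.
state_dict = torch.load(adapter_path, weights_only=True)

# lora_model: an already-constructed PeftModel, as in the surrounding code.
set_peft_model_state_dict(lora_model, state_dict)
```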
@@ -1 +1 @@
-coqui-tts==0.25.1
+coqui-tts>=0.27.0
@@ -196,33 +196,22 @@ def extract_thinking_block(string):
     return None, string


-@functools.lru_cache(maxsize=None)
-def convert_to_markdown(string, message_id=None):
-    if not string:
-        return ""
-
-    # Use a default message ID if none provided
-    if message_id is None:
-        message_id = "unknown"
-
-    # Extract thinking block if present
-    thinking_content, remaining_content = extract_thinking_block(string)
-
-    # Process the main content
-    html_output = process_markdown_content(remaining_content)
-
-    # If thinking content was found, process it using the same function
-    if thinking_content is not None:
-        thinking_html = process_markdown_content(thinking_content)
-
-        # Generate unique ID for the thinking block
-        block_id = f"thinking-{message_id}-0"
-
-        # Check if thinking is complete or still in progress
-        is_streaming = not remaining_content
-        title_text = "Thinking..." if is_streaming else "Thought"
-
-        thinking_block = f'''
+def build_thinking_block(thinking_content, message_id, has_remaining_content):
+    """Build HTML for a thinking block."""
+    if thinking_content is None:
+        return None
+
+    # Process the thinking content through markdown
+    thinking_html = process_markdown_content(thinking_content)
+
+    # Generate unique ID for the thinking block
+    block_id = f"thinking-{message_id}-0"
+
+    # Check if thinking is complete or still in progress
+    is_streaming = not has_remaining_content
+    title_text = "Thinking..." if is_streaming else "Thought"
+
+    return f'''
     <details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
         <summary class="thinking-header">
             {info_svg_small}

@@ -232,14 +221,20 @@ def convert_to_markdown(string, message_id=None):
     </details>
     '''

-        # Prepend the thinking block to the message HTML
-        html_output = thinking_block + html_output
-
-    return html_output
+
+def build_main_content_block(content):
+    """Build HTML for the main content block."""
+    if not content:
+        return ""
+
+    return process_markdown_content(content)


 def process_markdown_content(string):
-    """Process a string through the markdown conversion pipeline."""
+    """
+    Process a string through the markdown conversion pipeline.
+    Uses robust manual parsing to ensure correct LaTeX and Code Block rendering.
+    """
     if not string:
         return ""

@@ -280,7 +275,7 @@ def process_markdown_content(string):
     pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
     string = pattern.sub(replace_blockquote, string)

-    # Code
+    # Code block standardization
     string = string.replace('\\begin{code}', '```')
     string = string.replace('\\end{code}', '```')
     string = string.replace('\\begin{align*}', '$$')

@@ -301,6 +296,7 @@ def process_markdown_content(string):
     is_code = False
     is_latex = False

+    # Manual line iteration for robust structure parsing
     for line in string.split('\n'):
         stripped_line = line.strip()

@@ -371,6 +367,39 @@ def process_markdown_content(string):
     return html_output


+@functools.lru_cache(maxsize=None)
+def convert_to_markdown(string, message_id=None):
+    """
+    Convert a string to markdown HTML with support for multiple block types.
+    Blocks are assembled in order: thinking, main content, etc.
+    """
+    if not string:
+        return ""
+
+    # Use a default message ID if none provided
+    if message_id is None:
+        message_id = "unknown"
+
+    # Extract different components from the string
+    thinking_content, remaining_content = extract_thinking_block(string)
+
+    # Build individual HTML blocks
+    blocks = []
+
+    # Add thinking block if present
+    thinking_html = build_thinking_block(thinking_content, message_id, bool(remaining_content))
+    if thinking_html:
+        blocks.append(thinking_html)
+
+    # Add main content block
+    main_html = build_main_content_block(remaining_content)
+    if main_html:
+        blocks.append(main_html)
+
+    # Assemble all blocks into final HTML
+    return ''.join(blocks)
+
+
 def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
     '''
     Used to avoid caching convert_to_markdown calls during streaming.
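A note on the caching split: the new convert_to_markdown stays behind functools.lru_cache, so re-rendering a finished chat hits the cache, while convert_to_markdown_wrapped exists to avoid caching during streaming, when the input changes on every token. A minimal sketch of that bypass pattern, assuming only the standard functools behavior (the repository's actual wrapper may differ):

```python
import functools

@functools.lru_cache(maxsize=None)
def convert_to_markdown(string, message_id=None):
    ...  # expensive markdown -> HTML conversion

def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
    # Caching every partial string during streaming would only grow the cache,
    # so call the undecorated function directly when use_cache is False.
    if use_cache:
        return convert_to_markdown(string, message_id=message_id)
    return convert_to_markdown.__wrapped__(string, message_id=message_id)
```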
@@ -317,6 +317,7 @@ class LlamaServer:
             "--ctx-size", str(shared.args.ctx_size),
             "--gpu-layers", str(shared.args.gpu_layers),
             "--batch-size", str(shared.args.batch_size),
+            "--ubatch-size", str(shared.args.ubatch_size),
             "--port", str(self.port),
             "--no-webui",
             "--flash-attn", "on",

@@ -326,6 +327,8 @@ class LlamaServer:
             cmd += ["--threads", str(shared.args.threads)]
         if shared.args.threads_batch > 0:
             cmd += ["--threads-batch", str(shared.args.threads_batch)]
+        if shared.args.cpu_moe:
+            cmd.append("--cpu-moe")
         if shared.args.no_mmap:
             cmd.append("--no-mmap")
         if shared.args.mlock:
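Taken together, the new options map directly onto llama-server flags. A condensed sketch of the resulting command construction (the surrounding class and the remaining flags are omitted; names follow the diff, the helper itself is illustrative):

```python
def build_server_cmd(args, server_path: str, port: int) -> list:
    """Sketch: translate the relevant settings into llama-server flags."""
    cmd = [
        server_path,
        "--ctx-size", str(args.ctx_size),
        "--gpu-layers", str(args.gpu_layers),
        "--batch-size", str(args.batch_size),    # application-level batch size
        "--ubatch-size", str(args.ubatch_size),  # physical (device-level) batch size
        "--port", str(port),
        "--no-webui",
    ]
    if args.cpu_moe:
        cmd.append("--cpu-moe")  # keep MoE expert tensors on the CPU to save VRAM
    return cmd
```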
@@ -6,9 +6,11 @@ import gradio as gr
 loaders_and_params = OrderedDict({
     'llama.cpp': [
         'gpu_layers',
+        'cpu_moe',
         'threads',
         'threads_batch',
         'batch_size',
+        'ubatch_size',
         'ctx_size',
         'cache_type',
         'tensor_split',
@@ -66,6 +66,7 @@ group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the pr
 # llama.cpp
 group = parser.add_argument_group('llama.cpp')
 group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
+group.add_argument('--cpu-moe', action='store_true', help='Move the experts to the CPU (for MoE models).')
 group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
 group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')

@@ -73,7 +74,8 @@ group.add_argument('--row-split', action='store_true', help='Split the model by
 group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
 group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
 group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
-group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
+group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size.')
+group.add_argument('--ubatch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).')
 group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
 group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
@@ -611,7 +611,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
         bf16=shared.args.bf16,
         optim=optimizer,
         logging_steps=2 if stop_at_loss > 0 else 5,
-        evaluation_strategy="steps" if eval_data is not None else "no",
+        eval_strategy="steps" if eval_data is not None else "no",
         eval_steps=math.ceil(eval_steps / gradient_accumulation_steps) if eval_data is not None else None,
         save_strategy="steps" if eval_data is not None else "no",
         output_dir=lora_file_path,

@@ -620,7 +620,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
         # TODO: Enable multi-device support
         ddp_find_unused_parameters=None,
         no_cuda=shared.args.cpu,
-        use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
+        # use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
     ),
     data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
     callbacks=list([Callbacks()])
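The evaluation_strategy to eval_strategy change tracks the transformers rename, where the old keyword was deprecated and later dropped; with transformers 4.57.* pinned, eval_strategy is the accepted spelling. A minimal sketch of the updated call, showing only the evaluation-related fields (other arguments elided, values illustrative):

```python
import transformers

training_args = transformers.TrainingArguments(
    output_dir="lora_output",      # illustrative path
    eval_strategy="steps",         # was: evaluation_strategy="steps"
    eval_steps=50,
    save_strategy="steps",
    logging_steps=5,
)
```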
@@ -125,9 +125,11 @@ def list_model_elements():
         'loader',
         'cpu_memory',
         'gpu_layers',
+        'cpu_moe',
         'threads',
         'threads_batch',
         'batch_size',
+        'ubatch_size',
         'ctx_size',
         'cache_type',
         'tensor_split',
@@ -50,6 +50,7 @@ def create_ui():
 
             with gr.Column():
                 shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
+                shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
                 shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                 shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                 shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)

@@ -83,6 +84,7 @@ def create_ui():
                 shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
                 shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
                 shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
+                shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
                 shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
                 shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
                 shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)

@@ -94,7 +96,7 @@ def create_ui():
                 shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
 
             with gr.Column():
-                shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
+                shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
                 shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                 shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
                 shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@@ -5,13 +5,14 @@ colorama
 datasets
 einops
 fastapi==0.112.4
-flash-linear-attention==0.3.2
+flash-linear-attention==0.4.0
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -25,7 +26,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb

@@ -39,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
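Several of the updated wheel URLs also drop the trailing python_version == "3.11" environment marker, so the same requirement line now applies regardless of the interpreter version pip runs under. A small sketch of how pip-style markers are evaluated, using the packaging library (values are illustrative):

```python
from packaging.markers import Marker

# Markers like these gate which wheel line is installed on a given machine.
marker = Marker('platform_system == "Linux" and platform_machine == "x86_64"')
print(marker.evaluate())  # True on a typical x86_64 Linux interpreter

# The removed constraint simply narrowed the marker further:
old = Marker('platform_system == "Windows" and python_version == "3.11"')
print(old.evaluate({"platform_system": "Windows", "python_version": "3.12"}))  # False
```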
@@ -5,11 +5,12 @@ datasets
 einops
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -23,7 +24,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb

@@ -37,7 +38,7 @@ sse-starlette==1.6.5
 tiktoken

 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
@@ -5,11 +5,12 @@ datasets
 einops
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -23,7 +24,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb

@@ -37,7 +38,7 @@ sse-starlette==1.6.5
 tiktoken

 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
@@ -5,11 +5,12 @@ datasets
 einops
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -23,7 +24,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb

@@ -37,5 +38,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
@@ -5,11 +5,12 @@ datasets
 einops
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -23,7 +24,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb

@@ -37,6 +38,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
@@ -5,11 +5,12 @@ datasets
 einops
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -23,7 +24,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb

@@ -37,5 +38,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -5,11 +5,12 @@ datasets
 einops
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -23,7 +24,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb

@@ -37,5 +38,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -5,13 +5,14 @@ colorama
 datasets
 einops
 fastapi==0.112.4
-flash-linear-attention==0.3.2
+flash-linear-attention==0.4.0
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -25,7 +26,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb

@@ -39,10 +40,10 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
@@ -5,11 +5,12 @@ datasets
 einops
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*
 pandas
-peft==0.17.*
+peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0

@@ -23,7 +24,7 @@ scipy
 sentencepiece
 tensorboard
 transformers==4.57.*
-triton-windows==3.5.0.post21; platform_system == "Windows"
+triton-windows==3.5.1.post21; platform_system == "Windows"
 tqdm
 wandb
@ -1,6 +1,7 @@
|
||||||
audioop-lts<1.0; python_version >= "3.13"
|
audioop-lts<1.0; python_version >= "3.13"
|
||||||
fastapi==0.112.4
|
fastapi==0.112.4
|
||||||
html2text==2025.4.15
|
html2text==2025.4.15
|
||||||
|
huggingface-hub==0.36.0
|
||||||
jinja2==3.1.6
|
jinja2==3.1.6
|
||||||
markdown
|
markdown
|
||||||
numpy==2.2.*
|
numpy==2.2.*
|
||||||
|
|
@ -22,5 +23,5 @@ sse-starlette==1.6.5
|
||||||
tiktoken
|
tiktoken
|
||||||
|
|
||||||
# CUDA wheels
|
# CUDA wheels
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||||
|
|
|
||||||
requirements/portable/requirements_amd.txt (new file, 27 lines)

@@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich
tqdm

# Gradio
gradio==4.37.*
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl

# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
requirements/portable/requirements_amd_noavx2.txt (new file, 27 lines)

@@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich
tqdm

# Gradio
gradio==4.37.*
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl

# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -1,6 +1,7 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*

@@ -22,6 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
@@ -1,6 +1,7 @@
 audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 html2text==2025.4.15
+huggingface-hub==0.36.0
 jinja2==3.1.6
 markdown
 numpy==2.2.*

@@ -22,6 +23,5 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
|
|
@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
+huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
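This AVX2 build and the no-AVX2 variant below are separate requirements files rather than marker branches, so the right file has to be chosen for the CPU. A rough, Linux-only sketch for that check; the helper is hypothetical and not part of the project:

# Rough sketch (Linux only): check /proc/cpuinfo for the avx2 flag to decide
# between the AVX2 and no-AVX2 requirements files. Helper name is hypothetical.
def cpu_has_avx2(cpuinfo_path="/proc/cpuinfo"):
    try:
        with open(cpuinfo_path) as f:
            return any("avx2" in line for line in f if line.startswith("flags"))
    except OSError:
        return False  # unknown platform; fall back to the safer no-AVX2 build

print("use the AVX2 requirements file" if cpu_has_avx2() else "use the no-AVX2 requirements file")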
@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
+huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
+huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
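After the 0.56.0 to 0.62.0 bump, the installed build can be checked with standard-library metadata. The distribution name below is assumed to match the wheel filename; adjust it if the project publishes under a different name:

# Quick check (hedged): confirm which llama_cpp_binaries release is installed.
from importlib.metadata import version, PackageNotFoundError

try:
    print("llama_cpp_binaries:", version("llama_cpp_binaries"))
except PackageNotFoundError:
    print("llama_cpp_binaries is not installed in this environment")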
@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
+huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
+huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@ -21,6 +22,6 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

-# CUDA wheels
+# Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@ -1,6 +1,7 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
+huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@ -22,5 +23,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
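All of the portable requirements files above follow the same pattern: a pinned wheel URL, optionally followed by a semicolon and an environment marker. A hedged sketch that scans such a file and reports which wheels would apply on the current machine (the file path is hypothetical; assumes the packaging library is installed):

# Hedged sketch: list the direct-URL wheel lines in a requirements file whose
# markers match the current platform.
from packaging.markers import Marker

def applicable_wheels(path):
    selected = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line.startswith("https://"):
                continue  # only inspect pinned wheel URLs
            url, _, marker = line.partition(";")
            if not marker.strip() or Marker(marker.strip()).evaluate():
                selected.append(url.strip())
    return selected

# Example call (hypothetical filename):
# print(applicable_wheels("requirements/portable/requirements_vulkan_avx.txt"))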