Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2026-01-05 16:20:04 +01:00)
Compare commits
No commits in common. "main" and "v3.16" have entirely different histories.
.github/workflows/build-everything-tgw.yml (vendored, 7 changes)
@@ -41,13 +41,6 @@ jobs:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'

build_release_rocm_linux:
name: ROCm Linux
uses: ./.github/workflows/build-portable-release-rocm.yml
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'

build_release_cpu_windows:
name: CPU Windows
uses: ./.github/workflows/build-portable-release.yml

@@ -60,7 +60,7 @@ jobs:
'os' = @('ubuntu-22.04', 'windows-2022')
'pyver' = @("3.11")
'avx' = @("AVX2")
'cuda' = @("12.4")
'cuda' = @("11.7", "12.4")
}

if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

@@ -147,13 +147,22 @@ jobs:

# Create CUDA-specific requirements file if needed
cd "text-generation-webui-${VERSION_CLEAN}"
REQ_FILE="$BASE_REQ_FILE"
if [[ "$CUDA_VERSION" == "11.7" ]]; then
echo "Creating CUDA 11.7 specific requirements file"
sed 's/cu124/cu117/g' "$BASE_REQ_FILE" > requirements_cuda_temp.txt
REQ_FILE="requirements_cuda_temp.txt"
else
REQ_FILE="$BASE_REQ_FILE"
fi

# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"

# 5. Clean up
if [[ "$CUDA_VERSION" == "11.7" ]]; then
rm requirements_cuda_temp.txt
fi
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py

# 6. Create ZIP file
.github/workflows/build-portable-release-rocm.yml (vendored, 165 changes)
|
|
@ -1,165 +0,0 @@
|
|||
name: Build ROCm
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: 'Version tag of text-generation-webui to build: v3.0'
|
||||
default: 'v3.0'
|
||||
required: true
|
||||
type: string
|
||||
config:
|
||||
description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
|
||||
default: 'Default'
|
||||
required: false
|
||||
type: string
|
||||
exclude:
|
||||
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
|
||||
default: 'None'
|
||||
required: false
|
||||
type: string
|
||||
workflow_call:
|
||||
inputs:
|
||||
version:
|
||||
description: 'Version tag of text-generation-webui to build: v3.0'
|
||||
default: 'v3.0'
|
||||
required: true
|
||||
type: string
|
||||
config:
|
||||
description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
|
||||
default: 'Default'
|
||||
required: false
|
||||
type: string
|
||||
exclude:
|
||||
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
|
||||
default: 'None'
|
||||
required: false
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
jobs:
|
||||
define_matrix:
|
||||
name: Define Build Matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
defaults:
|
||||
run:
|
||||
shell: pwsh
|
||||
env:
|
||||
CONFIGIN: ${{ inputs.config }}
|
||||
EXCLUDEIN: ${{ inputs.exclude }}
|
||||
|
||||
steps:
|
||||
- name: Define Job Output
|
||||
id: set-matrix
|
||||
run: |
|
||||
$matrix = @{
|
||||
'os' = @('ubuntu-22.04')
|
||||
'pyver' = @("3.11")
|
||||
'avx' = @("AVX2")
|
||||
}
|
||||
|
||||
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
|
||||
|
||||
if ($env:EXCLUDEIN -ne 'None') {
|
||||
$exclusions = @()
|
||||
$exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
|
||||
$matrix['exclude'] = $exclusions
|
||||
}
|
||||
|
||||
$matrixOut = ConvertTo-Json $matrix -Compress
|
||||
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
|
||||
|
||||
build_wheels:
|
||||
name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }}
|
||||
needs: define_matrix
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
|
||||
defaults:
|
||||
run:
|
||||
shell: pwsh
|
||||
env:
|
||||
AVXVER: ${{ matrix.avx }}
|
||||
PCKGVER: ${{ inputs.version }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: 'oobabooga/text-generation-webui'
|
||||
ref: ${{ inputs.version }}
|
||||
submodules: 'recursive'
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.pyver }}
|
||||
|
||||
- name: Build Package
|
||||
shell: bash
|
||||
run: |
|
||||
VERSION_CLEAN="${{ inputs.version }}"
|
||||
VERSION_CLEAN="${VERSION_CLEAN#v}"
|
||||
cd ..
|
||||
cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
|
||||
cd "text-generation-webui-${VERSION_CLEAN}"
|
||||
|
||||
# Remove extensions that need additional requirements
|
||||
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
|
||||
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
|
||||
|
||||
# Define common variables
|
||||
AVX_SUPPORT="${{ matrix.avx }}"
|
||||
VERSION="${{ inputs.version }}"
|
||||
|
||||
# 1. Set platform-specific variables (Linux only for ROCm)
|
||||
PLATFORM="linux"
|
||||
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
|
||||
PIP_PATH="portable_env/bin/python -m pip"
|
||||
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
|
||||
rm start_macos.sh start_windows.bat
|
||||
|
||||
# 2. Download and extract Python
|
||||
cd ..
|
||||
echo "Downloading Python for $PLATFORM..."
|
||||
curl -L -o python-build.tar.gz "$PYTHON_URL"
|
||||
tar -xzf python-build.tar.gz
|
||||
mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
|
||||
|
||||
# 3. Prepare requirements file based on AVX
|
||||
if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
|
||||
BASE_REQ_FILE="requirements/portable/requirements_amd.txt"
|
||||
else
|
||||
BASE_REQ_FILE="requirements/portable/requirements_amd_noavx2.txt"
|
||||
fi
|
||||
REQ_FILE="$BASE_REQ_FILE"
|
||||
|
||||
cd "text-generation-webui-${VERSION_CLEAN}"
|
||||
|
||||
# 4. Install packages
|
||||
echo "Installing Python packages from $REQ_FILE..."
|
||||
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
|
||||
|
||||
# 5. Clean up
|
||||
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
|
||||
|
||||
# 6. Create ZIP file
|
||||
cd ..
|
||||
ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-rocm.zip"
|
||||
echo "Creating archive: $ZIP_NAME"
|
||||
|
||||
zip -r "$ZIP_NAME" "text-generation-webui-${VERSION_CLEAN}"
|
||||
|
||||
- name: Upload files to a GitHub release
|
||||
id: upload-release
|
||||
uses: svenstaro/upload-release-action@2.7.0
|
||||
continue-on-error: true
|
||||
with:
|
||||
repo_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
file: ../textgen-portable-*.zip
|
||||
tag: ${{ inputs.version }}
|
||||
file_glob: true
|
||||
make_latest: false
|
||||
overwrite: true
|
||||
.github/workflows/build-portable-release.yml (vendored, 2 changes)
@@ -57,7 +57,7 @@ jobs:
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
'os' = @('ubuntu-22.04', 'windows-2022', 'macos-13', 'macos-14')
'pyver' = @("3.11")
'avx' = @("AVX2")
}
README.md (10 changes)
@@ -21,10 +21,6 @@ A Gradio web UI for Large Language Models.
|:---:|:---:|
| |  |

## 🔥 News

- The project now supports **image generation**! Including Z-Image-Turbo, 4bit/8bit quantization, `torch.compile`, and LLM-generated prompt variations ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).

## Features

- Supports multiple local text generation backends, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)).

@@ -32,7 +28,6 @@ A Gradio web UI for Large Language Models.
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
- **Vision (multimodal models)**: Attach images to messages for visual understanding ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Multimodal-Tutorial)).
- **Image generation**: A dedicated tab for `diffusers` models like **Z-Image-Turbo**. Features 4-bit/8-bit quantization and a persistent gallery with metadata ([tutorial](https://github.com/oobabooga/text-generation-webui/wiki/Image-Generation-Tutorial)).
- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
- Aesthetic UI with dark and light themes.
- Syntax highlighting for code blocks and LaTeX rendering for mathematical expressions.

@@ -437,7 +432,6 @@ https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/ma

https://www.reddit.com/r/Oobabooga/

## Acknowledgments
## Acknowledgment

- In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
- This project was inspired by [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) and wouldn't exist without it.
In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
css/main.css (141 changes)
@ -93,11 +93,11 @@ ol li p, ul li p {
|
|||
display: inline-block;
|
||||
}
|
||||
|
||||
#notebook-parent-tab, #chat-tab, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab, #character-tab, #image-ai-tab {
|
||||
#notebook-parent-tab, #chat-tab, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab, #character-tab {
|
||||
border: 0;
|
||||
}
|
||||
|
||||
#notebook-parent-tab, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab, #character-tab, #image-ai-tab {
|
||||
#notebook-parent-tab, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab, #character-tab {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
|
|
@ -244,46 +244,37 @@ button {
|
|||
font-size: 100% !important;
|
||||
}
|
||||
|
||||
.pretty_scrollbar::-webkit-scrollbar,
|
||||
#image-history-gallery > :nth-child(2)::-webkit-scrollbar {
|
||||
.pretty_scrollbar::-webkit-scrollbar {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
}
|
||||
|
||||
.pretty_scrollbar::-webkit-scrollbar-track,
|
||||
#image-history-gallery > :nth-child(2)::-webkit-scrollbar-track {
|
||||
.pretty_scrollbar::-webkit-scrollbar-track {
|
||||
background: transparent;
|
||||
}
|
||||
|
||||
.pretty_scrollbar::-webkit-scrollbar-thumb,
|
||||
.pretty_scrollbar::-webkit-scrollbar-thumb:hover,
|
||||
#image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb,
|
||||
#image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb:hover {
|
||||
.pretty_scrollbar::-webkit-scrollbar-thumb:hover {
|
||||
background: var(--neutral-300);
|
||||
border-radius: 30px;
|
||||
}
|
||||
|
||||
.dark .pretty_scrollbar::-webkit-scrollbar-thumb,
|
||||
.dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover,
|
||||
.dark #image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb,
|
||||
.dark #image-history-gallery > :nth-child(2)::-webkit-scrollbar-thumb:hover {
|
||||
.dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover {
|
||||
background: rgb(255 255 255 / 6.25%);
|
||||
border-radius: 10px;
|
||||
}
|
||||
|
||||
.pretty_scrollbar::-webkit-resizer,
|
||||
#image-history-gallery > :nth-child(2)::-webkit-resizer {
|
||||
.pretty_scrollbar::-webkit-resizer {
|
||||
background: #c5c5d2;
|
||||
}
|
||||
|
||||
.dark .pretty_scrollbar::-webkit-resizer,
|
||||
.dark #image-history-gallery > :nth-child(2)::-webkit-resizer {
|
||||
.dark .pretty_scrollbar::-webkit-resizer {
|
||||
background: #ccc;
|
||||
border-radius: 10px;
|
||||
}
|
||||
|
||||
.pretty_scrollbar::-webkit-scrollbar-corner,
|
||||
#image-history-gallery > :nth-child(2)::-webkit-scrollbar-corner {
|
||||
.pretty_scrollbar::-webkit-scrollbar-corner {
|
||||
background: transparent;
|
||||
}
|
||||
|
||||
|
|
@ -1683,117 +1674,3 @@ button:focus {
|
|||
.dark .sidebar-vertical-separator {
|
||||
border-bottom: 1px solid rgb(255 255 255 / 10%);
|
||||
}
|
||||
|
||||
button#swap-height-width {
|
||||
position: absolute;
|
||||
top: -50px;
|
||||
right: 0;
|
||||
border: 0;
|
||||
}
|
||||
|
||||
#image-output-gallery, #image-output-gallery > :nth-child(2) {
|
||||
height: calc(100vh - 83px);
|
||||
max-height: calc(100vh - 83px);
|
||||
}
|
||||
|
||||
#image-history-gallery, #image-history-gallery > :nth-child(2) {
|
||||
height: calc(100vh - 174px);
|
||||
max-height: calc(100vh - 174px);
|
||||
}
|
||||
|
||||
/* Additional CSS for the paginated image gallery */
|
||||
|
||||
/* Page info styling */
|
||||
#image-page-info {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
min-width: 200px;
|
||||
font-size: 0.9em;
|
||||
color: var(--body-text-color-subdued);
|
||||
}
|
||||
|
||||
/* Settings display panel */
|
||||
#image-ai-tab .settings-display-panel {
|
||||
background: var(--background-fill-secondary);
|
||||
padding: 12px;
|
||||
border-radius: 8px;
|
||||
font-size: 0.9em;
|
||||
max-height: 300px;
|
||||
overflow-y: auto;
|
||||
margin-top: 8px;
|
||||
}
|
||||
|
||||
/* Gallery status message */
|
||||
#image-ai-tab .gallery-status {
|
||||
color: var(--color-accent);
|
||||
font-size: 0.85em;
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
/* Pagination button row alignment */
|
||||
#image-ai-tab .pagination-controls {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
/* Selected image preview container */
|
||||
#image-ai-tab .selected-preview-container {
|
||||
border: 1px solid var(--border-color-primary);
|
||||
border-radius: 8px;
|
||||
padding: 8px;
|
||||
background: var(--background-fill-secondary);
|
||||
}
|
||||
|
||||
/* Fix a gr.Markdown UI glitch when clicking Next in the
|
||||
* Image AI > Gallery tab */
|
||||
.min.svelte-1yrv54 {
|
||||
min-height: 0;
|
||||
}
|
||||
|
||||
/* Image Generation Progress Bar */
|
||||
#image-progress .image-ai-separator {
|
||||
height: 24px;
|
||||
margin: 20px 0;
|
||||
border-top: 1px solid var(--input-border-color);
|
||||
}
|
||||
|
||||
#image-progress .image-ai-progress-wrapper {
|
||||
height: 24px;
|
||||
margin: 20px 0;
|
||||
}
|
||||
|
||||
#image-progress .image-ai-progress-track {
|
||||
background: #e5e7eb;
|
||||
border-radius: 4px;
|
||||
overflow: hidden;
|
||||
height: 8px;
|
||||
}
|
||||
|
||||
.dark #image-progress .image-ai-progress-track {
|
||||
background: #333;
|
||||
}
|
||||
|
||||
#image-progress .image-ai-progress-fill {
|
||||
background: #4a9eff;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
#image-progress .image-ai-progress-text {
|
||||
text-align: center;
|
||||
font-size: 12px;
|
||||
color: #666;
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
.dark #image-progress .image-ai-progress-text {
|
||||
color: #888;
|
||||
}
|
||||
|
||||
#llm-prompt-variations {
|
||||
position: absolute;
|
||||
top: 0;
|
||||
left: calc(100% - 174px);
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -139,35 +139,6 @@ curl http://127.0.0.1:5000/v1/completions \

For base64-encoded images, just replace the inner "url" values with this format: `data:image/FORMAT;base64,BASE64_STRING` where FORMAT is the file type (png, jpeg, gif, etc.) and BASE64_STRING is your base64-encoded image data.
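As an illustration, a local file can be converted into that data URL format with a few lines of Python (a minimal sketch; the file name and the PNG format are placeholders):

```python
import base64

# Read a local PNG and wrap it in the data URL format described above.
with open("example.png", "rb") as f:
    b64_string = base64.b64encode(f.read()).decode("utf-8")

image_url = f"data:image/png;base64,{b64_string}"  # usable as the inner "url" value
```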

#### Image generation

```shell
curl http://127.0.0.1:5000/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "an orange tree",
    "steps": 9,
    "cfg_scale": 0,
    "batch_size": 1,
    "batch_count": 1
  }'
```

You need to load an image model first. You can do this via the UI, or by adding `--image-model your_model_name` when launching the server.

The output is a JSON object containing a `data` array. Each element has a `b64_json` field with the base64-encoded PNG image:

```json
{
  "created": 1764791227,
  "data": [
    {
      "b64_json": "iVBORw0KGgo..."
    }
  ]
}
```
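When scripting against this endpoint, each `b64_json` entry can be base64-decoded and written straight to disk. A minimal Python sketch (the helper name and output paths are illustrative, not part of the API):

```python
import base64

def save_images(response_json, prefix="generation"):
    """Decode every b64_json entry returned by /v1/images/generations into a PNG file."""
    paths = []
    for i, item in enumerate(response_json.get("data", [])):
        path = f"{prefix}_{i}.png"
        with open(path, "wb") as f:
            f.write(base64.b64decode(item["b64_json"]))
        paths.append(path)
    return paths
```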

#### SSE streaming

```shell

@@ -448,6 +419,7 @@ The following environment variables can be used (they take precedence over every
| `OPENEDAI_CERT_PATH` | SSL certificate file path | cert.pem |
| `OPENEDAI_KEY_PATH` | SSL key file path | key.pem |
| `OPENEDAI_DEBUG` | Enable debugging (set to 1) | 1 |
| `SD_WEBUI_URL` | WebUI URL (used by endpoint) | http://127.0.0.1:7861 |
| `OPENEDAI_EMBEDDING_MODEL` | Embedding model (if applicable) | sentence-transformers/all-mpnet-base-v2 |
| `OPENEDAI_EMBEDDING_DEVICE` | Embedding device (if applicable) | cuda |

@@ -458,6 +430,7 @@ You can also set the following variables in your `settings.yaml` file:
```
openai-embedding_device: cuda
openai-embedding_model: "sentence-transformers/all-mpnet-base-v2"
openai-sd_webui_url: http://127.0.0.1:7861
openai-debug: 1
```
@@ -1,98 +0,0 @@
# Image Generation Tutorial

This feature allows you to generate images using `diffusers` models like [Tongyi-MAI/Z-Image-Turbo](https://huggingface.co/Tongyi-MAI/Z-Image-Turbo) directly within the web UI.

<img alt="print" src="https://github.com/user-attachments/assets/5108de50-658b-4e93-b2ae-4656d076bc9d" />


## Installation

1. Clone the repository with

```
git clone https://github.com/oobabooga/text-generation-webui
```

or download it from [here](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip) and unzip it.

2. Use the one-click installer.

- Windows: Double click on `start_windows.bat`
- Linux: Run `./start_linux.sh`
- macOS: Run `./start_macos.sh`

Note: Image generation does not work with the portable builds in `.zip` format in the [Releases page](https://github.com/oobabooga/text-generation-webui/releases). You need the "full" version of the web UI.

## Downloading a model

1. Once installation ends, browse to `http://127.0.0.1:7860/`.
2. Click on "Image AI" on the left.
3. Click on "Model" at the top.
4. In the "Download model" field, paste `https://huggingface.co/Tongyi-MAI/Z-Image-Turbo` and click "Download".
5. Wait for the download to finish (it's 31 GB).

## Loading the model

Select the quantization option in the "Quantization" menu and click "Load".

The memory usage for `Z-Image-Turbo` for each option is:

| Quantization Method | VRAM Usage |
| :--- | :--- |
| None (FP16/BF16) | 25613 MiB |
| bnb-8bit | 16301 MiB |
| bnb-8bit + CPU Offload | 16235 MiB |
| bnb-4bit | 11533 MiB |
| bnb-4bit + CPU Offload | 7677 MiB |

The `torchao` options support `torch.compile` for faster image generation, with `float8wo` specifically providing native hardware acceleration for RTX 40-series and newer GPUs.

Note: The next time you launch the web UI, the model will get automatically loaded with your last settings when you try to generate an image. You do not need to go to the Model tab and click "Load" each time.
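For reference, the quantization options in this menu map to a pipeline-level `diffusers` quantization config. The following standalone sketch mirrors what the bundled loader does for the `bnb-4bit` option; the local model path and the CUDA device are assumptions, and it needs a recent `diffusers` release that ships `PipelineQuantizationConfig`:

```python
import torch
from diffusers import BitsAndBytesConfig as DiffusersBnBConfig
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig
from transformers import BitsAndBytesConfig as TransformersBnBConfig

# 4-bit NF4 quantization for both the transformer and the text encoder,
# mirroring the web UI's "bnb-4bit" menu option.
quant_config = PipelineQuantizationConfig(
    quant_mapping={
        "transformer": DiffusersBnBConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        ),
        "text_encoder": TransformersBnBConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        ),
    }
)

pipe = DiffusionPipeline.from_pretrained(
    "user_data/image_models/Z-Image-Turbo",  # assumed local download path
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    quantization_config=quant_config,
)
pipe.to("cuda")  # or pipe.enable_model_cpu_offload() for the "+ CPU Offload" rows above
```

The `bnb-8bit` option follows the same pattern with `load_in_8bit=True` on both components.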

## Generating images:

1. While still in the "Image AI" page, go to the "Generate" tab.
2. Type your prompt and click on the Generate button.

### Model-specific settings

- For Z-Image-Turbo, make sure to keep CFG Scale at 0 and Steps at 9. Do not write a Negative Prompt as it will get ignored with this CFG Scale value.

### LLM Prompt Variations

To use this feature, you need to load an LLM in the main "Model" page on the left.

If you have no idea what to use, do this to get started:

1. Download [Qwen3-4B-Q3_K_M.gguf](https://huggingface.co/unsloth/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-Q3_K_M.gguf) to your `text-generation-webui/user_data/models` folder.
2. Select the model in the dropdown menu in the "Model" page.
3. Click Load.

Then go back to the "Image AI" page and check "LLM Prompt Variations".

After that, your prompts will be automatically updated by the LLM each time you generate an image. If you use a "Sequential Count" value greater than 1, a new prompt will be created for each sequential batch.

The improvement in creativity is striking (prompt: `Photo of a beautiful woman at night under moonlight`):

<img alt="comparison_collage" src="https://github.com/user-attachments/assets/67884832-2800-41cb-a146-e88e25af89c4" />

## Generating images over API

It is possible to generate images using the project's API. Just make sure to start the server with `--api`, either by

1. Passing the `--api` flag to your `start` script, like `./start_linux.sh --api`, or
2. Writing `--api` to your `user_data/CMD_FLAGS.txt` file and relaunching the web UI.

Here is an API call example:

```
curl http://127.0.0.1:5000/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "an orange tree",
    "steps": 9,
    "cfg_scale": 0,
    "batch_size": 1,
    "batch_count": 1
  }'
```
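The same request can be issued from Python; a minimal sketch using `requests` (the output file names are arbitrary, and the address assumes the default API port of 5000):

```python
import base64

import requests

payload = {
    "prompt": "an orange tree",
    "steps": 9,
    "cfg_scale": 0,
    "batch_size": 1,
    "batch_count": 1,
}

response = requests.post("http://127.0.0.1:5000/v1/images/generations", json=payload)
response.raise_for_status()

# Each entry in "data" carries a base64-encoded PNG in its "b64_json" field.
for i, item in enumerate(response.json()["data"]):
    with open(f"orange_tree_{i}.png", "wb") as f:
        f.write(base64.b64decode(item["b64_json"]))
```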
@@ -823,7 +823,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
lora_model = get_peft_model(shared.model, config)
if not always_override and Path(f"{lora_file_path}/adapter_model.bin").is_file():
logger.info("Loading existing LoRA data...")
state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin", weights_only=True)
state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin")
set_peft_model_state_dict(lora_model, state_dict_peft)

print(f" + Continue Training on {RED}{lora_file_path}/adapter_model.bin{RESET}")
@@ -1 +1 @@
coqui-tts>=0.27.0
coqui-tts==0.25.1
|||
|
|
@ -1,69 +1,70 @@
|
|||
"""
|
||||
OpenAI-compatible image generation using local diffusion models.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from extensions.openai.errors import ServiceUnavailableError
|
||||
from modules import shared
|
||||
|
||||
|
||||
def generations(request):
|
||||
"""
|
||||
Generate images using the loaded diffusion model.
|
||||
Returns dict with 'created' timestamp and 'data' list of images.
|
||||
"""
|
||||
from modules.ui_image_generation import generate
|
||||
def generations(prompt: str, size: str, response_format: str, n: int):
|
||||
# Stable Diffusion callout wrapper for txt2img
|
||||
# Low effort implementation for compatibility. With only "prompt" being passed and assuming DALL-E
|
||||
# the results will be limited and likely poor. SD has hundreds of models and dozens of settings.
|
||||
# If you want high quality tailored results you should just use the Stable Diffusion API directly.
|
||||
# it's too general an API to try and shape the result with specific tags like negative prompts
|
||||
# or "masterpiece", etc. SD configuration is beyond the scope of this API.
|
||||
# At this point I will not add the edits and variations endpoints (ie. img2img) because they
|
||||
# require changing the form data handling to accept multipart form data, also to properly support
|
||||
# url return types will require file management and a web serving files... Perhaps later!
|
||||
base_model_size = 512 if 'SD_BASE_MODEL_SIZE' not in os.environ else int(os.environ.get('SD_BASE_MODEL_SIZE', 512))
|
||||
sd_defaults = {
|
||||
'sampler_name': 'DPM++ 2M Karras', # vast improvement
|
||||
'steps': 30,
|
||||
}
|
||||
|
||||
if shared.image_model is None:
|
||||
raise ServiceUnavailableError("No image model loaded. Load a model via the UI first.")
|
||||
width, height = [int(x) for x in size.split('x')] # ignore the restrictions on size
|
||||
|
||||
width, height = request.get_width_height()
|
||||
# to hack on better generation, edit default payload.
|
||||
payload = {
|
||||
'prompt': prompt, # ignore prompt limit of 1000 characters
|
||||
'width': width,
|
||||
'height': height,
|
||||
'batch_size': n,
|
||||
}
|
||||
payload.update(sd_defaults)
|
||||
|
||||
# Build state dict: GenerationOptions fields + image-specific keys
|
||||
state = request.model_dump()
|
||||
state.update({
|
||||
'image_model_menu': shared.image_model_name,
|
||||
'image_prompt': request.prompt,
|
||||
'image_neg_prompt': request.negative_prompt,
|
||||
'image_width': width,
|
||||
'image_height': height,
|
||||
'image_steps': request.steps,
|
||||
'image_seed': request.image_seed,
|
||||
'image_batch_size': request.batch_size,
|
||||
'image_batch_count': request.batch_count,
|
||||
'image_cfg_scale': request.cfg_scale,
|
||||
'image_llm_variations': False,
|
||||
})
|
||||
scale = min(width, height) / base_model_size
|
||||
if scale >= 1.2:
|
||||
# for better performance with the default size (1024), and larger res.
|
||||
scaler = {
|
||||
'width': width // scale,
|
||||
'height': height // scale,
|
||||
'hr_scale': scale,
|
||||
'enable_hr': True,
|
||||
'hr_upscaler': 'Latent',
|
||||
'denoising_strength': 0.68,
|
||||
}
|
||||
payload.update(scaler)
|
||||
|
||||
# Exhaust generator, keep final result
|
||||
images = []
|
||||
for images, _ in generate(state, save_images=False):
|
||||
pass
|
||||
resp = {
|
||||
'created': int(time.time()),
|
||||
'data': []
|
||||
}
|
||||
from extensions.openai.script import params
|
||||
|
||||
if not images:
|
||||
raise ServiceUnavailableError("Image generation failed or produced no images.")
|
||||
# TODO: support SD_WEBUI_AUTH username:password pair.
|
||||
sd_url = f"{os.environ.get('SD_WEBUI_URL', params.get('sd_webui_url', ''))}/sdapi/v1/txt2img"
|
||||
|
||||
# Build response
|
||||
resp = {'created': int(time.time()), 'data': []}
|
||||
for img in images:
|
||||
b64 = _image_to_base64(img)
|
||||
|
||||
image_obj = {'revised_prompt': request.prompt}
|
||||
|
||||
if request.response_format == 'b64_json':
|
||||
image_obj['b64_json'] = b64
|
||||
response = requests.post(url=sd_url, json=payload)
|
||||
r = response.json()
|
||||
if response.status_code != 200 or 'images' not in r:
|
||||
print(r)
|
||||
raise ServiceUnavailableError(r.get('error', 'Unknown error calling Stable Diffusion'), code=response.status_code, internal_message=r.get('errors', None))
|
||||
# r['parameters']...
|
||||
for b64_json in r['images']:
|
||||
if response_format == 'b64_json':
|
||||
resp['data'].extend([{'b64_json': b64_json}])
|
||||
else:
|
||||
image_obj['url'] = f'data:image/png;base64,{b64}'
|
||||
|
||||
resp['data'].append(image_obj)
|
||||
resp['data'].extend([{'url': f'data:image/png;base64,{b64_json}'}]) # yeah it's lazy. requests.get() will not work with this
|
||||
|
||||
return resp
|
||||
|
||||
|
||||
def _image_to_base64(image) -> str:
|
||||
buffered = io.BytesIO()
|
||||
image.save(buffered, format="PNG")
|
||||
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||
|
|
|
|||
|
|
@ -17,8 +17,10 @@ from sse_starlette import EventSourceResponse
|
|||
from starlette.concurrency import iterate_in_threadpool
|
||||
|
||||
import extensions.openai.completions as OAIcompletions
|
||||
import extensions.openai.images as OAIimages
|
||||
import extensions.openai.logits as OAIlogits
|
||||
import extensions.openai.models as OAImodels
|
||||
from extensions.openai.errors import ServiceUnavailableError
|
||||
from extensions.openai.tokens import token_count, token_decode, token_encode
|
||||
from extensions.openai.utils import _start_cloudflared
|
||||
from modules import shared
|
||||
|
|
@ -38,8 +40,6 @@ from .typing import (
|
|||
EmbeddingsResponse,
|
||||
EncodeRequest,
|
||||
EncodeResponse,
|
||||
ImageGenerationRequest,
|
||||
ImageGenerationResponse,
|
||||
LoadLorasRequest,
|
||||
LoadModelRequest,
|
||||
LogitsRequest,
|
||||
|
|
@ -54,12 +54,12 @@ from .typing import (
|
|||
params = {
|
||||
'embedding_device': 'cpu',
|
||||
'embedding_model': 'sentence-transformers/all-mpnet-base-v2',
|
||||
'sd_webui_url': '',
|
||||
'debug': 0
|
||||
}
|
||||
|
||||
|
||||
streaming_semaphore = asyncio.Semaphore(1)
|
||||
image_generation_semaphore = asyncio.Semaphore(1)
|
||||
|
||||
|
||||
def verify_api_key(authorization: str = Header(None)) -> None:
|
||||
|
|
@ -228,13 +228,20 @@ async def handle_audio_transcription(request: Request):
|
|||
return JSONResponse(content=transcription)
|
||||
|
||||
|
||||
@app.post('/v1/images/generations', response_model=ImageGenerationResponse, dependencies=check_key)
|
||||
async def handle_image_generation(request_data: ImageGenerationRequest):
|
||||
import extensions.openai.images as OAIimages
|
||||
@app.post('/v1/images/generations', dependencies=check_key)
|
||||
async def handle_image_generation(request: Request):
|
||||
|
||||
async with image_generation_semaphore:
|
||||
response = await asyncio.to_thread(OAIimages.generations, request_data)
|
||||
return JSONResponse(response)
|
||||
if not os.environ.get('SD_WEBUI_URL', params.get('sd_webui_url', '')):
|
||||
raise ServiceUnavailableError("Stable Diffusion not available. SD_WEBUI_URL not set.")
|
||||
|
||||
body = await request.json()
|
||||
prompt = body['prompt']
|
||||
size = body.get('size', '1024x1024')
|
||||
response_format = body.get('response_format', 'url') # or b64_json
|
||||
n = body.get('n', 1) # ignore the batch limits of max 10
|
||||
|
||||
response = await OAIimages.generations(prompt=prompt, size=size, response_format=response_format, n=n)
|
||||
return JSONResponse(response)
|
||||
|
||||
|
||||
@app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key)
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ class CompletionRequest(GenerationOptions, CompletionRequestParams):
|
|||
class CompletionResponse(BaseModel):
|
||||
id: str
|
||||
choices: List[dict]
|
||||
created: int = Field(default_factory=lambda: int(time.time()))
|
||||
created: int = int(time.time())
|
||||
model: str
|
||||
object: str = "text_completion"
|
||||
usage: dict
|
||||
|
|
@ -178,7 +178,7 @@ class ChatCompletionRequest(GenerationOptions, ChatCompletionRequestParams):
|
|||
class ChatCompletionResponse(BaseModel):
|
||||
id: str
|
||||
choices: List[dict]
|
||||
created: int = Field(default_factory=lambda: int(time.time()))
|
||||
created: int = int(time.time())
|
||||
model: str
|
||||
object: str = "chat.completion"
|
||||
usage: dict
|
||||
|
|
@ -264,42 +264,6 @@ class LoadLorasRequest(BaseModel):
|
|||
lora_names: List[str]
|
||||
|
||||
|
||||
class ImageGenerationRequest(BaseModel):
|
||||
"""Image-specific parameters for generation."""
|
||||
prompt: str
|
||||
negative_prompt: str = ""
|
||||
size: str = Field(default="1024x1024", description="'WIDTHxHEIGHT'")
|
||||
steps: int = Field(default=9, ge=1)
|
||||
cfg_scale: float = Field(default=0.0, ge=0.0)
|
||||
image_seed: int = Field(default=-1, description="-1 for random")
|
||||
batch_size: int | None = Field(default=None, ge=1, description="Parallel batch size (VRAM heavy)")
|
||||
n: int = Field(default=1, ge=1, description="Alias for batch_size (OpenAI compatibility)")
|
||||
batch_count: int = Field(default=1, ge=1, description="Sequential batch count")
|
||||
|
||||
# OpenAI compatibility (unused)
|
||||
model: str | None = None
|
||||
response_format: str = "b64_json"
|
||||
user: str | None = None
|
||||
|
||||
@model_validator(mode='after')
|
||||
def resolve_batch_size(self):
|
||||
if self.batch_size is None:
|
||||
self.batch_size = self.n
|
||||
return self
|
||||
|
||||
def get_width_height(self) -> tuple[int, int]:
|
||||
try:
|
||||
parts = self.size.lower().split('x')
|
||||
return int(parts[0]), int(parts[1])
|
||||
except (ValueError, IndexError):
|
||||
return 1024, 1024
|
||||
|
||||
|
||||
class ImageGenerationResponse(BaseModel):
|
||||
created: int = Field(default_factory=lambda: int(time.time()))
|
||||
data: List[dict]
|
||||
|
||||
|
||||
def to_json(obj):
|
||||
return json.dumps(obj.__dict__, indent=4)
|
||||
|
||||
|
|
|
|||
|
|
@ -36,17 +36,3 @@ function switch_to_character() {
|
|||
document.getElementById("character-tab-button").click();
|
||||
scrollToTop();
|
||||
}
|
||||
|
||||
function switch_to_image_ai_generate() {
|
||||
const container = document.querySelector("#image-ai-tab");
|
||||
const buttons = container.getElementsByTagName("button");
|
||||
|
||||
for (let i = 0; i < buttons.length; i++) {
|
||||
if (buttons[i].textContent.trim() === "Generate") {
|
||||
buttons[i].click();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
scrollToTop();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import copy
|
|||
import functools
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import pprint
|
||||
import re
|
||||
import shutil
|
||||
|
|
@ -25,7 +26,6 @@ from modules.html_generator import (
|
|||
convert_to_markdown,
|
||||
make_thumbnail
|
||||
)
|
||||
from modules.image_utils import open_image_safely
|
||||
from modules.logging_colors import logger
|
||||
from modules.text_generation import (
|
||||
generate_reply,
|
||||
|
|
@ -112,9 +112,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
|
|||
add_generation_prompt=False,
|
||||
enable_thinking=state['enable_thinking'],
|
||||
reasoning_effort=state['reasoning_effort'],
|
||||
thinking_budget=-1 if state.get('enable_thinking', True) else 0,
|
||||
bos_token=shared.bos_token,
|
||||
eos_token=shared.eos_token,
|
||||
thinking_budget=-1 if state.get('enable_thinking', True) else 0
|
||||
)
|
||||
|
||||
chat_renderer = partial(
|
||||
|
|
@ -477,7 +475,7 @@ def get_stopping_strings(state):
|
|||
|
||||
if state['mode'] in ['instruct', 'chat-instruct']:
|
||||
template = jinja_env.from_string(state['instruction_template_str'])
|
||||
renderer = partial(template.render, add_generation_prompt=False, bos_token=shared.bos_token, eos_token=shared.eos_token)
|
||||
renderer = partial(template.render, add_generation_prompt=False)
|
||||
renderers.append(renderer)
|
||||
|
||||
if state['mode'] in ['chat']:
|
||||
|
|
@ -1518,6 +1516,20 @@ def load_instruction_template_memoized(template):
|
|||
return load_instruction_template(template)
|
||||
|
||||
|
||||
def open_image_safely(path):
|
||||
if path is None or not isinstance(path, str) or not Path(path).exists():
|
||||
return None
|
||||
|
||||
if os.path.islink(path):
|
||||
return None
|
||||
|
||||
try:
|
||||
return Image.open(path)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to open image file: {path}. Reason: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def upload_character(file, img_path, tavern=False):
|
||||
img = open_image_safely(img_path)
|
||||
decoded_file = file if isinstance(file, str) else file.decode('utf-8')
|
||||
|
|
|
|||
|
|
@ -196,45 +196,50 @@ def extract_thinking_block(string):
|
|||
return None, string
|
||||
|
||||
|
||||
def build_thinking_block(thinking_content, message_id, has_remaining_content):
|
||||
"""Build HTML for a thinking block."""
|
||||
if thinking_content is None:
|
||||
return None
|
||||
|
||||
# Process the thinking content through markdown
|
||||
thinking_html = process_markdown_content(thinking_content)
|
||||
|
||||
# Generate unique ID for the thinking block
|
||||
block_id = f"thinking-{message_id}-0"
|
||||
|
||||
# Check if thinking is complete or still in progress
|
||||
is_streaming = not has_remaining_content
|
||||
title_text = "Thinking..." if is_streaming else "Thought"
|
||||
|
||||
return f'''
|
||||
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
|
||||
<summary class="thinking-header">
|
||||
{info_svg_small}
|
||||
<span class="thinking-title">{title_text}</span>
|
||||
</summary>
|
||||
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
|
||||
</details>
|
||||
'''
|
||||
|
||||
|
||||
def build_main_content_block(content):
|
||||
"""Build HTML for the main content block."""
|
||||
if not content:
|
||||
@functools.lru_cache(maxsize=None)
|
||||
def convert_to_markdown(string, message_id=None):
|
||||
if not string:
|
||||
return ""
|
||||
|
||||
return process_markdown_content(content)
|
||||
# Use a default message ID if none provided
|
||||
if message_id is None:
|
||||
message_id = "unknown"
|
||||
|
||||
# Extract thinking block if present
|
||||
thinking_content, remaining_content = extract_thinking_block(string)
|
||||
|
||||
# Process the main content
|
||||
html_output = process_markdown_content(remaining_content)
|
||||
|
||||
# If thinking content was found, process it using the same function
|
||||
if thinking_content is not None:
|
||||
thinking_html = process_markdown_content(thinking_content)
|
||||
|
||||
# Generate unique ID for the thinking block
|
||||
block_id = f"thinking-{message_id}-0"
|
||||
|
||||
# Check if thinking is complete or still in progress
|
||||
is_streaming = not remaining_content
|
||||
title_text = "Thinking..." if is_streaming else "Thought"
|
||||
|
||||
thinking_block = f'''
|
||||
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
|
||||
<summary class="thinking-header">
|
||||
{info_svg_small}
|
||||
<span class="thinking-title">{title_text}</span>
|
||||
</summary>
|
||||
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
|
||||
</details>
|
||||
'''
|
||||
|
||||
# Prepend the thinking block to the message HTML
|
||||
html_output = thinking_block + html_output
|
||||
|
||||
return html_output
|
||||
|
||||
|
||||
def process_markdown_content(string):
|
||||
"""
|
||||
Process a string through the markdown conversion pipeline.
|
||||
Uses robust manual parsing to ensure correct LaTeX and Code Block rendering.
|
||||
"""
|
||||
"""Process a string through the markdown conversion pipeline."""
|
||||
if not string:
|
||||
return ""
|
||||
|
||||
|
|
@ -275,7 +280,7 @@ def process_markdown_content(string):
|
|||
pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
|
||||
string = pattern.sub(replace_blockquote, string)
|
||||
|
||||
# Code block standardization
|
||||
# Code
|
||||
string = string.replace('\\begin{code}', '```')
|
||||
string = string.replace('\\end{code}', '```')
|
||||
string = string.replace('\\begin{align*}', '$$')
|
||||
|
|
@ -296,7 +301,6 @@ def process_markdown_content(string):
|
|||
is_code = False
|
||||
is_latex = False
|
||||
|
||||
# Manual line iteration for robust structure parsing
|
||||
for line in string.split('\n'):
|
||||
stripped_line = line.strip()
|
||||
|
||||
|
|
@ -367,39 +371,6 @@ def process_markdown_content(string):
|
|||
return html_output
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
def convert_to_markdown(string, message_id=None):
|
||||
"""
|
||||
Convert a string to markdown HTML with support for multiple block types.
|
||||
Blocks are assembled in order: thinking, main content, etc.
|
||||
"""
|
||||
if not string:
|
||||
return ""
|
||||
|
||||
# Use a default message ID if none provided
|
||||
if message_id is None:
|
||||
message_id = "unknown"
|
||||
|
||||
# Extract different components from the string
|
||||
thinking_content, remaining_content = extract_thinking_block(string)
|
||||
|
||||
# Build individual HTML blocks
|
||||
blocks = []
|
||||
|
||||
# Add thinking block if present
|
||||
thinking_html = build_thinking_block(thinking_content, message_id, bool(remaining_content))
|
||||
if thinking_html:
|
||||
blocks.append(thinking_html)
|
||||
|
||||
# Add main content block
|
||||
main_html = build_main_content_block(remaining_content)
|
||||
if main_html:
|
||||
blocks.append(main_html)
|
||||
|
||||
# Assemble all blocks into final HTML
|
||||
return ''.join(blocks)
|
||||
|
||||
|
||||
def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
|
||||
'''
|
||||
Used to avoid caching convert_to_markdown calls during streaming.
|
||||
|
|
|
|||
|
|
@ -1,200 +0,0 @@
|
|||
import time
|
||||
|
||||
import modules.shared as shared
|
||||
from modules.logging_colors import logger
|
||||
from modules.utils import resolve_model_path
|
||||
|
||||
|
||||
def get_quantization_config(quant_method):
|
||||
"""
|
||||
Get the appropriate quantization config based on the selected method.
|
||||
Applies quantization to both the transformer and the text_encoder.
|
||||
"""
|
||||
import torch
|
||||
# Import BitsAndBytesConfig from BOTH libraries to be safe
|
||||
from diffusers import BitsAndBytesConfig as DiffusersBnBConfig
|
||||
from diffusers import TorchAoConfig
|
||||
from diffusers.quantizers import PipelineQuantizationConfig
|
||||
from transformers import BitsAndBytesConfig as TransformersBnBConfig
|
||||
|
||||
if quant_method == 'none' or not quant_method:
|
||||
return None
|
||||
|
||||
# Bitsandbytes 8-bit quantization
|
||||
elif quant_method == 'bnb-8bit':
|
||||
return PipelineQuantizationConfig(
|
||||
quant_mapping={
|
||||
"transformer": DiffusersBnBConfig(
|
||||
load_in_8bit=True
|
||||
),
|
||||
"text_encoder": TransformersBnBConfig(
|
||||
load_in_8bit=True
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
# Bitsandbytes 4-bit quantization
|
||||
elif quant_method == 'bnb-4bit':
|
||||
return PipelineQuantizationConfig(
|
||||
quant_mapping={
|
||||
"transformer": DiffusersBnBConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
bnb_4bit_use_double_quant=True
|
||||
),
|
||||
"text_encoder": TransformersBnBConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
bnb_4bit_use_double_quant=True
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
# torchao int8 weight-only
|
||||
elif quant_method == 'torchao-int8wo':
|
||||
return PipelineQuantizationConfig(
|
||||
quant_mapping={
|
||||
"transformer": TorchAoConfig("int8wo"),
|
||||
"text_encoder": TorchAoConfig("int8wo")
|
||||
}
|
||||
)
|
||||
|
||||
# torchao fp4 (e2m1)
|
||||
elif quant_method == 'torchao-fp4':
|
||||
return PipelineQuantizationConfig(
|
||||
quant_mapping={
|
||||
"transformer": TorchAoConfig("fp4_e2m1"),
|
||||
"text_encoder": TorchAoConfig("fp4_e2m1")
|
||||
}
|
||||
)
|
||||
|
||||
# torchao float8 weight-only
|
||||
elif quant_method == 'torchao-float8wo':
|
||||
return PipelineQuantizationConfig(
|
||||
quant_mapping={
|
||||
"transformer": TorchAoConfig("float8wo"),
|
||||
"text_encoder": TorchAoConfig("float8wo")
|
||||
}
|
||||
)
|
||||
|
||||
else:
|
||||
logger.warning(f"Unknown quantization method: {quant_method}. Loading without quantization.")
|
||||
return None
|
||||
|
||||
|
||||
def get_pipeline_type(pipe):
|
||||
"""
|
||||
Detect the pipeline type based on the loaded pipeline class.
|
||||
|
||||
Returns:
|
||||
str: 'zimage', 'qwenimage', or 'unknown'
|
||||
"""
|
||||
class_name = pipe.__class__.__name__
|
||||
if class_name == 'ZImagePipeline':
|
||||
return 'zimage'
|
||||
elif class_name == 'QwenImagePipeline':
|
||||
return 'qwenimage'
|
||||
else:
|
||||
return 'unknown'
|
||||
|
||||
|
||||
def load_image_model(model_name, dtype='bfloat16', attn_backend='sdpa', cpu_offload=False, compile_model=False, quant_method='none'):
|
||||
"""
|
||||
Load a diffusers image generation model.
|
||||
|
||||
Args:
|
||||
model_name: Name of the model directory
|
||||
dtype: 'bfloat16' or 'float16'
|
||||
attn_backend: 'sdpa' or 'flash_attention_2'
|
||||
cpu_offload: Enable CPU offloading for low VRAM
|
||||
compile_model: Compile the model for faster inference (slow first run)
|
||||
quant_method: 'none', 'bnb-8bit', 'bnb-4bit', or torchao options (int8wo, fp4, float8wo)
|
||||
"""
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
from modules.torch_utils import get_device
|
||||
|
||||
logger.info(f"Loading image model \"{model_name}\" with quantization: {quant_method}")
|
||||
t0 = time.time()
|
||||
|
||||
dtype_map = {"bfloat16": torch.bfloat16, "float16": torch.float16}
|
||||
target_dtype = dtype_map.get(dtype, torch.bfloat16)
|
||||
|
||||
model_path = resolve_model_path(model_name, image_model=True)
|
||||
|
||||
try:
|
||||
# Get quantization config based on selected method
|
||||
pipeline_quant_config = get_quantization_config(quant_method)
|
||||
|
||||
# Load the pipeline
|
||||
load_kwargs = {
|
||||
"torch_dtype": target_dtype,
|
||||
"low_cpu_mem_usage": True,
|
||||
}
|
||||
|
||||
if pipeline_quant_config is not None:
|
||||
load_kwargs["quantization_config"] = pipeline_quant_config
|
||||
|
||||
# Use DiffusionPipeline for automatic pipeline detection
|
||||
# This handles both ZImagePipeline and QwenImagePipeline
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
str(model_path),
|
||||
**load_kwargs
|
||||
)
|
||||
|
||||
pipeline_type = get_pipeline_type(pipe)
|
||||
|
||||
if not cpu_offload:
|
||||
pipe.to(get_device())
|
||||
|
||||
modules = ["transformer", "unet"]
|
||||
|
||||
# Set attention backend
|
||||
if attn_backend == 'flash_attention_2':
|
||||
for name in modules:
|
||||
mod = getattr(pipe, name, None)
|
||||
if hasattr(mod, "set_attention_backend"):
|
||||
mod.set_attention_backend("flash")
|
||||
break
|
||||
|
||||
# Compile model
|
||||
if compile_model:
|
||||
for name in modules:
|
||||
mod = getattr(pipe, name, None)
|
||||
if hasattr(mod, "compile"):
|
||||
logger.info("Compiling model (first run will be slow)...")
|
||||
mod.compile()
|
||||
break
|
||||
|
||||
if cpu_offload:
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
shared.image_model = pipe
|
||||
shared.image_model_name = model_name
|
||||
shared.image_pipeline_type = pipeline_type
|
||||
|
||||
logger.info(f"Loaded image model \"{model_name}\" in {(time.time() - t0):.2f} seconds.")
|
||||
return pipe
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load image model: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
def unload_image_model():
|
||||
"""Unload the current image model and free VRAM."""
|
||||
if shared.image_model is None:
|
||||
return
|
||||
|
||||
del shared.image_model
|
||||
shared.image_model = None
|
||||
shared.image_model_name = 'None'
|
||||
shared.image_pipeline_type = None
|
||||
|
||||
from modules.torch_utils import clear_torch_cache
|
||||
clear_torch_cache()
|
||||
|
||||
logger.info("Image model unloaded.")
|
||||
|
|
@ -1,7 +1,9 @@
|
|||
"""
|
||||
Shared image processing utilities for multimodal support.
|
||||
Used by both ExLlamaV3 and llama.cpp implementations.
|
||||
"""
|
||||
import base64
|
||||
import io
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Tuple
|
||||
|
||||
from PIL import Image
|
||||
|
|
@ -9,20 +11,6 @@ from PIL import Image
|
|||
from modules.logging_colors import logger
|
||||
|
||||
|
||||
def open_image_safely(path):
|
||||
if path is None or not isinstance(path, str) or not Path(path).exists():
|
||||
return None
|
||||
|
||||
if os.path.islink(path):
|
||||
return None
|
||||
|
||||
try:
|
||||
return Image.open(path)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to open image file: {path}. Reason: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def convert_pil_to_base64(image: Image.Image) -> str:
|
||||
"""Converts a PIL Image to a base64 encoded string."""
|
||||
buffered = io.BytesIO()
|
||||
|
|
|
|||
|
|
@ -317,7 +317,6 @@ class LlamaServer:
|
|||
"--ctx-size", str(shared.args.ctx_size),
|
||||
"--gpu-layers", str(shared.args.gpu_layers),
|
||||
"--batch-size", str(shared.args.batch_size),
|
||||
"--ubatch-size", str(shared.args.ubatch_size),
|
||||
"--port", str(self.port),
|
||||
"--no-webui",
|
||||
"--flash-attn", "on",
|
||||
|
|
@ -327,8 +326,6 @@ class LlamaServer:
|
|||
cmd += ["--threads", str(shared.args.threads)]
|
||||
if shared.args.threads_batch > 0:
|
||||
cmd += ["--threads-batch", str(shared.args.threads_batch)]
|
||||
if shared.args.cpu_moe:
|
||||
cmd.append("--cpu-moe")
|
||||
if shared.args.no_mmap:
|
||||
cmd.append("--no-mmap")
|
||||
if shared.args.mlock:
|
||||
|
|
|
|||
|
|
@ -6,11 +6,9 @@ import gradio as gr
|
|||
loaders_and_params = OrderedDict({
|
||||
'llama.cpp': [
|
||||
'gpu_layers',
|
||||
'cpu_moe',
|
||||
'threads',
|
||||
'threads_batch',
|
||||
'batch_size',
|
||||
'ubatch_size',
|
||||
'ctx_size',
|
||||
'cache_type',
|
||||
'tensor_split',
|
||||
|
|
|
|||
|
|
@ -89,8 +89,8 @@ def get_model_metadata(model):
|
|||
else:
|
||||
bos_token = ""
|
||||
|
||||
shared.bos_token = bos_token
|
||||
shared.eos_token = eos_token
|
||||
template = template.replace('eos_token', "'{}'".format(eos_token))
|
||||
template = template.replace('bos_token', "'{}'".format(bos_token))
|
||||
|
||||
template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
|
||||
template = re.sub(r'raise_exception\([^)]*\)', "''", template)
|
||||
|
|
@ -160,16 +160,13 @@ def get_model_metadata(model):
|
|||
|
||||
# 4. If a template was found from any source, process it
|
||||
if template:
|
||||
shared.bos_token = '<s>'
|
||||
shared.eos_token = '</s>'
|
||||
|
||||
for k in ['eos_token', 'bos_token']:
|
||||
if k in metadata:
|
||||
value = metadata[k]
|
||||
if isinstance(value, dict):
|
||||
value = value['content']
|
||||
|
||||
setattr(shared, k, value)
|
||||
template = template.replace(k, "'{}'".format(value))
|
||||
|
||||
template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
|
||||
template = re.sub(r'raise_exception\([^)]*\)', "''", template)
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import yaml
|
|||
from modules.logging_colors import logger
|
||||
from modules.presets import default_preset
|
||||
|
||||
# Text model variables
|
||||
# Model variables
|
||||
model = None
|
||||
tokenizer = None
|
||||
model_name = 'None'
|
||||
|
|
@ -19,13 +19,6 @@ is_seq2seq = False
|
|||
is_multimodal = False
|
||||
model_dirty_from_training = False
|
||||
lora_names = []
|
||||
bos_token = '<s>'
|
||||
eos_token = '</s>'
|
||||
|
||||
# Image model variables
|
||||
image_model = None
|
||||
image_model_name = 'None'
|
||||
image_pipeline_type = None
|
||||
|
||||
# Generation variables
|
||||
stop_everything = False
|
||||
|
|
@ -53,18 +46,6 @@ group.add_argument('--extensions', type=str, nargs='+', help='The list of extens
|
|||
group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
|
||||
group.add_argument('--idle-timeout', type=int, default=0, help='Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.')
|
||||
|
||||
# Image generation
|
||||
group = parser.add_argument_group('Image model')
|
||||
group.add_argument('--image-model', type=str, help='Name of the image model to select on startup (overrides saved setting).')
|
||||
group.add_argument('--image-model-dir', type=str, default='user_data/image_models', help='Path to directory with all the image models.')
|
||||
group.add_argument('--image-dtype', type=str, default=None, choices=['bfloat16', 'float16'], help='Data type for image model.')
|
||||
group.add_argument('--image-attn-backend', type=str, default=None, choices=['flash_attention_2', 'sdpa'], help='Attention backend for image model.')
|
||||
group.add_argument('--image-cpu-offload', action='store_true', help='Enable CPU offloading for image model.')
|
||||
group.add_argument('--image-compile', action='store_true', help='Compile the image model for faster inference.')
|
||||
group.add_argument('--image-quant', type=str, default=None,
|
||||
choices=['none', 'bnb-8bit', 'bnb-4bit', 'torchao-int8wo', 'torchao-fp4', 'torchao-float8wo'],
|
||||
help='Quantization method for image model.')
|
||||
|
||||
# Model loader
|
||||
group = parser.add_argument_group('Model loader')
|
||||
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.')
|
||||
|
|
@ -85,7 +66,6 @@ group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the pr
|
|||
# llama.cpp
|
||||
group = parser.add_argument_group('llama.cpp')
|
||||
group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
|
||||
group.add_argument('--cpu-moe', action='store_true', help='Move the experts to the CPU (for MoE models).')
|
||||
group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
|
||||
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
|
||||
group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
|
||||
|
|
@ -93,8 +73,7 @@ group.add_argument('--row-split', action='store_true', help='Split the model by
|
|||
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
|
||||
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
|
||||
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
|
||||
group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size.')
|
||||
group.add_argument('--ubatch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).')
|
||||
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
|
||||
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
|
||||
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
|
||||
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
|
||||
|
|
@ -309,26 +288,6 @@ settings = {

    # Extensions
    'default_extensions': [],

    # Image generation settings
    'image_prompt': '',
    'image_neg_prompt': '',
    'image_width': 1024,
    'image_height': 1024,
    'image_aspect_ratio': '1:1 Square',
    'image_steps': 9,
    'image_cfg_scale': 0.0,
    'image_seed': -1,
    'image_batch_size': 1,
    'image_batch_count': 1,
    'image_llm_variations': False,
    'image_llm_variations_prompt': 'Write a variation of the image generation prompt above. Consider the intent of the user with that prompt and write something that will likely please them, with added details. Output only the new prompt. Do not add any explanations, prefixes, or additional text.',
    'image_model_menu': 'None',
    'image_dtype': 'bfloat16',
    'image_attn_backend': 'flash_attention_2',
    'image_cpu_offload': False,
    'image_compile': False,
    'image_quant': 'none',
}

default_settings = copy.deepcopy(settings)
@ -353,22 +312,6 @@ def do_cmd_flags_warnings():
        logger.warning('\nThe multi-user mode is highly experimental and should not be shared publicly.')


def apply_image_model_cli_overrides():
    """Apply command-line overrides for image model settings."""
    if args.image_model is not None:
        settings['image_model_menu'] = args.image_model
    if args.image_dtype is not None:
        settings['image_dtype'] = args.image_dtype
    if args.image_attn_backend is not None:
        settings['image_attn_backend'] = args.image_attn_backend
    if args.image_cpu_offload:
        settings['image_cpu_offload'] = True
    if args.image_compile:
        settings['image_compile'] = True
    if args.image_quant is not None:
        settings['image_quant'] = args.image_quant


def fix_loader_name(name):
    if not name:
        return name
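apply_image_model_cli_overrides() above applies a simple precedence rule: a CLI flag replaces the corresponding settings entry only when it was explicitly passed. A minimal standalone sketch of that rule, with the argparse namespace faked via SimpleNamespace:

```python
# Sketch of the override precedence: CLI values win over the defaults in
# `settings`, but only when the flag was actually supplied. The namespace and
# dict here are stand-ins for illustration.
from types import SimpleNamespace

settings = {'image_dtype': 'bfloat16', 'image_quant': 'none'}
args = SimpleNamespace(image_dtype=None, image_quant='bnb-4bit')

if args.image_dtype is not None:
    settings['image_dtype'] = args.image_dtype
if args.image_quant is not None:
    settings['image_quant'] = args.image_quant

print(settings)  # {'image_dtype': 'bfloat16', 'image_quant': 'bnb-4bit'}
```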
@ -611,7 +611,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
            bf16=shared.args.bf16,
            optim=optimizer,
            logging_steps=2 if stop_at_loss > 0 else 5,
            eval_strategy="steps" if eval_data is not None else "no",
            evaluation_strategy="steps" if eval_data is not None else "no",
            eval_steps=math.ceil(eval_steps / gradient_accumulation_steps) if eval_data is not None else None,
            save_strategy="steps" if eval_data is not None else "no",
            output_dir=lora_file_path,

@ -620,7 +620,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
            # TODO: Enable multi-device support
            ddp_find_unused_parameters=None,
            no_cuda=shared.args.cpu,
            # use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
            use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
        callbacks=list([Callbacks()])
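The hunk above swaps `evaluation_strategy` for `eval_strategy` in the TrainingArguments call, reflecting the keyword rename in newer transformers releases. A hedged compatibility sketch (not part of the repository) that picks whichever keyword the installed version accepts:

```python
# Compatibility sketch, assuming transformers is installed: choose the
# evaluation-strategy keyword by inspecting the TrainingArguments signature
# instead of hard-coding one name.
import inspect

import transformers

kwargs = {'output_dir': 'lora_out', 'logging_steps': 5}
params = inspect.signature(transformers.TrainingArguments.__init__).parameters
strategy_key = 'eval_strategy' if 'eval_strategy' in params else 'evaluation_strategy'
kwargs[strategy_key] = 'steps'

training_args = transformers.TrainingArguments(**kwargs)
```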
@ -125,11 +125,9 @@ def list_model_elements():
        'loader',
        'cpu_memory',
        'gpu_layers',
        'cpu_moe',
        'threads',
        'threads_batch',
        'batch_size',
        'ubatch_size',
        'ctx_size',
        'cache_type',
        'tensor_split',
@ -280,29 +278,6 @@ def list_interface_input_elements():
        'include_past_attachments',
    ]

    if not shared.args.portable:
        # Image generation elements
        elements += [
            'image_prompt',
            'image_neg_prompt',
            'image_width',
            'image_height',
            'image_aspect_ratio',
            'image_steps',
            'image_cfg_scale',
            'image_seed',
            'image_batch_size',
            'image_batch_count',
            'image_llm_variations',
            'image_llm_variations_prompt',
            'image_model_menu',
            'image_dtype',
            'image_attn_backend',
            'image_compile',
            'image_cpu_offload',
            'image_quant',
        ]

    return elements
@ -532,33 +507,9 @@ def setup_auto_save():
        'theme_state',
        'show_two_notebook_columns',
        'paste_to_attachment',
        'include_past_attachments',
        'include_past_attachments'
    ]

    if not shared.args.portable:
        # Image generation tab (ui_image_generation.py)
        change_elements += [
            'image_prompt',
            'image_neg_prompt',
            'image_width',
            'image_height',
            'image_aspect_ratio',
            'image_steps',
            'image_cfg_scale',
            'image_seed',
            'image_batch_size',
            'image_batch_count',
            'image_llm_variations',
            'image_llm_variations_prompt',
            'image_model_menu',
            'image_dtype',
            'image_attn_backend',
            'image_compile',
            'image_cpu_offload',
            'image_quant',
        ]

    for element_name in change_elements:
        if element_name in shared.gradio:
            shared.gradio[element_name].change(
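The loop at the end of the hunk attaches the same `.change()` handler to every registered component. A toy, self-contained Gradio sketch of that wiring pattern (the components and the save callback are stand-ins, not the webui's):

```python
# Illustrative sketch of the auto-save wiring pattern above: iterate over a
# list of component keys and attach the same .change() callback to each one.
import gradio as gr


def save_setting(value):
    # Placeholder for persisting the changed value to a settings file.
    print(f"saved: {value}")


components = {}
with gr.Blocks() as demo:
    components['image_steps'] = gr.Slider(1, 100, value=9, label="Steps")
    components['image_cfg_scale'] = gr.Slider(0.0, 10.0, value=0.0, label="CFG Scale")

    for name in ['image_steps', 'image_cfg_scale']:
        if name in components:
            components[name].change(save_setting, components[name], None)

# demo.launch()  # uncomment to try it locally
```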
@ -1,993 +0,0 @@
import json
import os
import random
import time
import traceback
from datetime import datetime
from pathlib import Path

import gradio as gr
from PIL.PngImagePlugin import PngInfo

from modules import shared, ui, utils
from modules.image_models import (
    get_pipeline_type,
    load_image_model,
    unload_image_model
)
from modules.image_utils import open_image_safely
from modules.logging_colors import logger
from modules.text_generation import stop_everything_event
from modules.utils import check_model_loaded, gradio

ASPECT_RATIOS = {
    "1:1 Square": (1, 1),
    "16:9 Cinema": (16, 9),
    "9:16 Mobile": (9, 16),
    "4:3 Photo": (4, 3),
    "Custom": None,
}

STEP = 16
IMAGES_PER_PAGE = 32
# Settings keys to save in PNG metadata (Generate tab only)
METADATA_SETTINGS_KEYS = [
    'image_prompt',
    'image_neg_prompt',
    'image_width',
    'image_height',
    'image_aspect_ratio',
    'image_steps',
    'image_seed',
    'image_cfg_scale',
]

# Cache for all image paths
_image_cache = []
_cache_timestamp = 0


def round_to_step(value, step=STEP):
    return round(value / step) * step


def clamp(value, min_val, max_val):
    return max(min_val, min(max_val, value))
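A quick arithmetic check of the two helpers above (copied inline so the snippet runs standalone): dimensions snap to the 16-pixel grid set by STEP and are clamped to the 256 to 2048 slider range.

```python
# The helpers are duplicated here only so the example executes on its own.
STEP = 16


def round_to_step(value, step=STEP):
    return round(value / step) * step


def clamp(value, min_val, max_val):
    return max(min_val, min(max_val, value))


print(round_to_step(1820.4))   # 1824: round(1820.4 / 16) = 114, and 114 * 16 = 1824
print(round_to_step(250))      # 256:  round(250 / 16) = 16, and 16 * 16 = 256
print(clamp(2100, 256, 2048))  # 2048: values above the slider maximum are capped
```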
def apply_aspect_ratio(aspect_ratio, current_width, current_height):
    if aspect_ratio == "Custom" or aspect_ratio not in ASPECT_RATIOS:
        return current_width, current_height

    w_ratio, h_ratio = ASPECT_RATIOS[aspect_ratio]

    if w_ratio == h_ratio:
        base = min(current_width, current_height)
        new_width = base
        new_height = base
    elif w_ratio < h_ratio:
        new_width = current_width
        new_height = round_to_step(current_width * h_ratio / w_ratio)
    else:
        new_height = current_height
        new_width = round_to_step(current_height * w_ratio / h_ratio)

    new_width = clamp(new_width, 256, 2048)
    new_height = clamp(new_height, 256, 2048)

    return int(new_width), int(new_height)


def update_height_from_width(width, aspect_ratio):
    if aspect_ratio == "Custom" or aspect_ratio not in ASPECT_RATIOS:
        return gr.update()

    w_ratio, h_ratio = ASPECT_RATIOS[aspect_ratio]
    new_height = round_to_step(width * h_ratio / w_ratio)
    new_height = clamp(new_height, 256, 2048)

    return int(new_height)


def update_width_from_height(height, aspect_ratio):
    if aspect_ratio == "Custom" or aspect_ratio not in ASPECT_RATIOS:
        return gr.update()

    w_ratio, h_ratio = ASPECT_RATIOS[aspect_ratio]
    new_width = round_to_step(height * w_ratio / h_ratio)
    new_width = clamp(new_width, 256, 2048)

    return int(new_width)
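To make the branch logic above concrete, here is the "16:9 Cinema" case traced by hand for a 1024x1024 starting canvas (plain arithmetic, no imports needed):

```python
# Tracing apply_aspect_ratio("16:9 Cinema", 1024, 1024) by hand.
w_ratio, h_ratio = 16, 9                    # ASPECT_RATIOS["16:9 Cinema"]
new_height = 1024                           # w_ratio > h_ratio, so the height is kept
new_width = round(1024 * 16 / 9 / 16) * 16  # round_to_step(1820.44...) = 1824
print(new_width, new_height)                # 1824 1024, both already inside 256..2048
```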
||||
def swap_dimensions_and_update_ratio(width, height, aspect_ratio):
|
||||
new_width, new_height = height, width
|
||||
|
||||
new_ratio = "Custom"
|
||||
for name, ratios in ASPECT_RATIOS.items():
|
||||
if ratios is None:
|
||||
continue
|
||||
w_r, h_r = ratios
|
||||
expected_height = new_width * h_r / w_r
|
||||
if abs(expected_height - new_height) < STEP:
|
||||
new_ratio = name
|
||||
break
|
||||
|
||||
return new_width, new_height, new_ratio
|
||||
|
||||
|
||||
def build_generation_metadata(state, actual_seed):
|
||||
"""Build metadata dict from generation settings."""
|
||||
metadata = {}
|
||||
for key in METADATA_SETTINGS_KEYS:
|
||||
if key in state:
|
||||
metadata[key] = state[key]
|
||||
|
||||
# Store the actual seed used (not -1)
|
||||
metadata['image_seed'] = actual_seed
|
||||
metadata['generated_at'] = datetime.now().isoformat()
|
||||
metadata['model'] = shared.image_model_name
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
def save_generated_images(images, state, actual_seed):
|
||||
"""Save images with generation metadata embedded in PNG. Returns list of saved file paths."""
|
||||
if shared.args.multi_user:
|
||||
return []
|
||||
|
||||
date_str = datetime.now().strftime("%Y-%m-%d")
|
||||
folder_path = os.path.join("user_data", "image_outputs", date_str)
|
||||
os.makedirs(folder_path, exist_ok=True)
|
||||
|
||||
metadata = build_generation_metadata(state, actual_seed)
|
||||
metadata_json = json.dumps(metadata, ensure_ascii=False)
|
||||
|
||||
saved_paths = []
|
||||
for idx, img in enumerate(images):
|
||||
timestamp = datetime.now().strftime("%H-%M-%S")
|
||||
filename = f"TGW_{timestamp}_{actual_seed:010d}_{idx:03d}.png"
|
||||
filepath = os.path.join(folder_path, filename)
|
||||
|
||||
# Create PNG metadata
|
||||
png_info = PngInfo()
|
||||
png_info.add_text("image_gen_settings", metadata_json)
|
||||
|
||||
# Save with metadata
|
||||
img.save(filepath, pnginfo=png_info)
|
||||
saved_paths.append(filepath)
|
||||
|
||||
return saved_paths
|
||||
|
||||
|
||||
def read_image_metadata(image_path):
|
||||
"""Read generation metadata from PNG file."""
|
||||
try:
|
||||
img = open_image_safely(image_path)
|
||||
if img is None:
|
||||
return None
|
||||
try:
|
||||
if hasattr(img, 'text') and 'image_gen_settings' in img.text:
|
||||
return json.loads(img.text['image_gen_settings'])
|
||||
finally:
|
||||
img.close()
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not read metadata from {image_path}: {e}")
|
||||
return None
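save_generated_images() and read_image_metadata() round-trip the settings through a PNG tEXt chunk named image_gen_settings. A self-contained sketch of that round trip with Pillow (the file name and settings values are illustrative):

```python
# Round-trip sketch of the PNG metadata scheme: a JSON blob stored under the
# "image_gen_settings" key via PngInfo, read back through PngImageFile.text.
import json

from PIL import Image
from PIL.PngImagePlugin import PngInfo

settings = {"image_prompt": "a lighthouse at dusk", "image_seed": 1234}

png_info = PngInfo()
png_info.add_text("image_gen_settings", json.dumps(settings))

img = Image.new("RGB", (64, 64))
img.save("example.png", pnginfo=png_info)

with Image.open("example.png") as reloaded:
    print(json.loads(reloaded.text["image_gen_settings"]))  # {'image_prompt': ..., 'image_seed': 1234}
```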
|
||||
|
||||
|
||||
def format_metadata_for_display(metadata):
|
||||
"""Format metadata as readable text."""
|
||||
if not metadata:
|
||||
return "No generation settings found in this image."
|
||||
|
||||
lines = []
|
||||
|
||||
# Display in a nice order
|
||||
display_order = [
|
||||
('image_prompt', 'Prompt'),
|
||||
('image_neg_prompt', 'Negative Prompt'),
|
||||
('image_width', 'Width'),
|
||||
('image_height', 'Height'),
|
||||
('image_aspect_ratio', 'Aspect Ratio'),
|
||||
('image_steps', 'Steps'),
|
||||
('image_cfg_scale', 'CFG Scale'),
|
||||
('image_seed', 'Seed'),
|
||||
('model', 'Model'),
|
||||
('generated_at', 'Generated At'),
|
||||
]
|
||||
|
||||
for key, label in display_order:
|
||||
if key in metadata:
|
||||
value = metadata[key]
|
||||
if key in ['image_prompt', 'image_neg_prompt'] and value:
|
||||
# Truncate long prompts for display
|
||||
if len(str(value)) > 200:
|
||||
value = str(value)[:200] + "..."
|
||||
lines.append(f"**{label}:** {value}")
|
||||
|
||||
return "\n\n".join(lines)
|
||||
|
||||
|
||||
def get_all_history_images(force_refresh=False):
|
||||
"""Get all history images sorted by modification time (newest first). Uses caching."""
|
||||
global _image_cache, _cache_timestamp
|
||||
|
||||
output_dir = os.path.join("user_data", "image_outputs")
|
||||
if not os.path.exists(output_dir):
|
||||
return []
|
||||
|
||||
# Check if we need to refresh cache
|
||||
current_time = time.time()
|
||||
if not force_refresh and _image_cache and (current_time - _cache_timestamp) < 2:
|
||||
return _image_cache
|
||||
|
||||
image_files = []
|
||||
for root, _, files in os.walk(output_dir):
|
||||
for file in files:
|
||||
if file.endswith((".png", ".jpg", ".jpeg")):
|
||||
full_path = os.path.join(root, file)
|
||||
image_files.append((full_path, os.path.getmtime(full_path)))
|
||||
|
||||
image_files.sort(key=lambda x: x[1], reverse=True)
|
||||
_image_cache = [x[0] for x in image_files]
|
||||
_cache_timestamp = current_time
|
||||
|
||||
return _image_cache
|
||||
|
||||
|
||||
def get_paginated_images(page=0, force_refresh=False):
|
||||
"""Get images for a specific page."""
|
||||
all_images = get_all_history_images(force_refresh)
|
||||
total_images = len(all_images)
|
||||
total_pages = max(1, (total_images + IMAGES_PER_PAGE - 1) // IMAGES_PER_PAGE)
|
||||
|
||||
# Clamp page to valid range
|
||||
page = max(0, min(page, total_pages - 1))
|
||||
|
||||
start_idx = page * IMAGES_PER_PAGE
|
||||
end_idx = min(start_idx + IMAGES_PER_PAGE, total_images)
|
||||
|
||||
page_images = all_images[start_idx:end_idx]
|
||||
|
||||
return page_images, page, total_pages, total_images
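The pagination arithmetic in get_paginated_images() above, shown standalone for a hypothetical library of 75 images:

```python
# With IMAGES_PER_PAGE = 32 and 75 images on disk, page index 2 (the last page)
# slices items 64..74 out of the newest-first list.
IMAGES_PER_PAGE = 32
total_images = 75

total_pages = max(1, (total_images + IMAGES_PER_PAGE - 1) // IMAGES_PER_PAGE)  # 3
page = max(0, min(2, total_pages - 1))                                         # requested page, clamped
start_idx = page * IMAGES_PER_PAGE                                             # 64
end_idx = min(start_idx + IMAGES_PER_PAGE, total_images)                       # 75
print(total_pages, start_idx, end_idx)  # 3 64 75 -> the last page holds 11 images
```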
|
||||
|
||||
|
||||
def get_initial_page_info():
|
||||
"""Get page info string for initial load."""
|
||||
_, page, total_pages, total_images = get_paginated_images(0)
|
||||
return f"Page {page + 1} of {total_pages} ({total_images} total images)"
|
||||
|
||||
|
||||
def refresh_gallery(current_page=0):
|
||||
"""Refresh gallery with current page."""
|
||||
images, page, total_pages, total_images = get_paginated_images(current_page, force_refresh=True)
|
||||
page_info = f"Page {page + 1} of {total_pages} ({total_images} total images)"
|
||||
return images, page, page_info
|
||||
|
||||
|
||||
def go_to_page(page_num, current_page):
|
||||
"""Go to a specific page (1-indexed input)."""
|
||||
try:
|
||||
page = int(page_num) - 1 # Convert to 0-indexed
|
||||
except (ValueError, TypeError):
|
||||
page = current_page
|
||||
|
||||
images, page, total_pages, total_images = get_paginated_images(page)
|
||||
page_info = f"Page {page + 1} of {total_pages} ({total_images} total images)"
|
||||
return images, page, page_info
|
||||
|
||||
|
||||
def next_page(current_page):
|
||||
"""Go to next page."""
|
||||
images, page, total_pages, total_images = get_paginated_images(current_page + 1)
|
||||
page_info = f"Page {page + 1} of {total_pages} ({total_images} total images)"
|
||||
return images, page, page_info
|
||||
|
||||
|
||||
def prev_page(current_page):
|
||||
"""Go to previous page."""
|
||||
images, page, total_pages, total_images = get_paginated_images(current_page - 1)
|
||||
page_info = f"Page {page + 1} of {total_pages} ({total_images} total images)"
|
||||
return images, page, page_info
|
||||
|
||||
|
||||
def on_gallery_select(evt: gr.SelectData, current_page):
|
||||
"""Handle image selection from gallery."""
|
||||
if evt.index is None:
|
||||
return "", "Select an image to view its settings"
|
||||
|
||||
if not _image_cache:
|
||||
get_all_history_images()
|
||||
|
||||
all_images = _image_cache
|
||||
total_images = len(all_images)
|
||||
|
||||
# Calculate the actual index in the full list
|
||||
start_idx = current_page * IMAGES_PER_PAGE
|
||||
actual_idx = start_idx + evt.index
|
||||
|
||||
if actual_idx >= total_images:
|
||||
return "", "Image not found"
|
||||
|
||||
image_path = all_images[actual_idx]
|
||||
metadata = read_image_metadata(image_path)
|
||||
metadata_display = format_metadata_for_display(metadata)
|
||||
|
||||
return image_path, metadata_display
|
||||
|
||||
|
||||
def send_to_generate(selected_image_path):
|
||||
"""Load settings from selected image and return updates for all Generate tab inputs."""
|
||||
if not selected_image_path or not os.path.exists(selected_image_path):
|
||||
return [gr.update()] * 8 + ["No image selected"]
|
||||
|
||||
metadata = read_image_metadata(selected_image_path)
|
||||
if not metadata:
|
||||
return [gr.update()] * 8 + ["No settings found in this image"]
|
||||
|
||||
# Return updates for each input element in order
|
||||
updates = [
|
||||
gr.update(value=metadata.get('image_prompt', '')),
|
||||
gr.update(value=metadata.get('image_neg_prompt', '')),
|
||||
gr.update(value=metadata.get('image_width', 1024)),
|
||||
gr.update(value=metadata.get('image_height', 1024)),
|
||||
gr.update(value=metadata.get('image_aspect_ratio', '1:1 Square')),
|
||||
gr.update(value=metadata.get('image_steps', 9)),
|
||||
gr.update(value=metadata.get('image_seed', -1)),
|
||||
gr.update(value=metadata.get('image_cfg_scale', 0.0)),
|
||||
]
|
||||
|
||||
status = f"✓ Settings loaded from image (seed: {metadata.get('image_seed', 'unknown')})"
|
||||
return updates + [status]
|
||||
|
||||
|
||||
def read_dropped_image_metadata(image_path):
|
||||
"""Read metadata from a dropped/uploaded image."""
|
||||
if not image_path:
|
||||
return "Drop an image to view its generation settings."
|
||||
|
||||
metadata = read_image_metadata(image_path)
|
||||
return format_metadata_for_display(metadata)
|
||||
|
||||
|
||||
def create_ui():
|
||||
if shared.settings['image_model_menu'] != 'None':
|
||||
shared.image_model_name = shared.settings['image_model_menu']
|
||||
|
||||
with gr.Tab("Image AI", elem_id="image-ai-tab"):
|
||||
with gr.Tabs():
|
||||
# TAB 1: GENERATE
|
||||
with gr.TabItem("Generate"):
|
||||
with gr.Row():
|
||||
with gr.Column(scale=4, min_width=350):
|
||||
shared.gradio['image_prompt'] = gr.Textbox(
|
||||
label="Prompt",
|
||||
placeholder="Describe your imagination...",
|
||||
lines=3,
|
||||
autofocus=True,
|
||||
value=shared.settings['image_prompt']
|
||||
)
|
||||
shared.gradio['image_neg_prompt'] = gr.Textbox(
|
||||
label="Negative Prompt",
|
||||
placeholder="Low quality...",
|
||||
lines=3,
|
||||
value=shared.settings['image_neg_prompt']
|
||||
)
|
||||
shared.gradio['image_llm_variations'] = gr.Checkbox(
|
||||
value=shared.settings['image_llm_variations'],
|
||||
label='LLM Prompt Variations',
|
||||
elem_id="llm-prompt-variations",
|
||||
)
|
||||
shared.gradio['image_llm_variations_prompt'] = gr.Textbox(
|
||||
value=shared.settings['image_llm_variations_prompt'],
|
||||
label='Variation Prompt',
|
||||
lines=3,
|
||||
placeholder='Instructions for generating prompt variations...',
|
||||
visible=shared.settings['image_llm_variations'],
|
||||
info='Use the loaded LLM to generate creative prompt variations for each sequential batch.'
|
||||
)
|
||||
|
||||
shared.gradio['image_generate_btn'] = gr.Button("Generate", variant="primary", size="lg")
|
||||
shared.gradio['image_stop_btn'] = gr.Button("Stop", size="lg", visible=False)
|
||||
shared.gradio['image_progress'] = gr.HTML(
|
||||
value=progress_bar_html(),
|
||||
elem_id="image-progress"
|
||||
)
|
||||
|
||||
gr.Markdown("### Dimensions")
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
shared.gradio['image_width'] = gr.Slider(256, 2048, value=shared.settings['image_width'], step=STEP, label="Width")
|
||||
with gr.Column():
|
||||
shared.gradio['image_height'] = gr.Slider(256, 2048, value=shared.settings['image_height'], step=STEP, label="Height")
|
||||
shared.gradio['image_swap_btn'] = gr.Button("⇄ Swap", elem_classes='refresh-button', scale=0, min_width=80, elem_id="swap-height-width")
|
||||
|
||||
with gr.Row():
|
||||
shared.gradio['image_aspect_ratio'] = gr.Radio(
|
||||
choices=["1:1 Square", "16:9 Cinema", "9:16 Mobile", "4:3 Photo", "Custom"],
|
||||
value=shared.settings['image_aspect_ratio'],
|
||||
label="Aspect Ratio",
|
||||
interactive=True
|
||||
)
|
||||
|
||||
gr.Markdown("### Config")
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
shared.gradio['image_steps'] = gr.Slider(1, 100, value=shared.settings['image_steps'], step=1, label="Steps")
|
||||
shared.gradio['image_cfg_scale'] = gr.Slider(
|
||||
0.0, 10.0,
|
||||
value=shared.settings['image_cfg_scale'],
|
||||
step=0.1,
|
||||
label="CFG Scale",
|
||||
info="Z-Image Turbo: 0.0 | Qwen: 4.0"
|
||||
)
|
||||
shared.gradio['image_seed'] = gr.Number(label="Seed", value=shared.settings['image_seed'], precision=0, info="-1 = Random")
|
||||
|
||||
with gr.Column():
|
||||
shared.gradio['image_batch_size'] = gr.Slider(1, 32, value=shared.settings['image_batch_size'], step=1, label="Batch Size (VRAM Heavy)", info="Generates N images at once.")
|
||||
shared.gradio['image_batch_count'] = gr.Slider(1, 128, value=shared.settings['image_batch_count'], step=1, label="Sequential Count (Loop)", info="Repeats the generation N times.")
|
||||
|
||||
with gr.Column(scale=6, min_width=500):
|
||||
with gr.Column(elem_classes=["viewport-container"]):
|
||||
shared.gradio['image_output_gallery'] = gr.Gallery(label="Output", show_label=False, columns=2, rows=2, height="80vh", object_fit="contain", preview=True, elem_id="image-output-gallery")
|
||||
|
||||
# TAB 2: GALLERY (with pagination)
|
||||
with gr.TabItem("Gallery"):
|
||||
with gr.Row():
|
||||
with gr.Column(scale=3):
|
||||
# Pagination controls
|
||||
with gr.Row():
|
||||
shared.gradio['image_refresh_history'] = gr.Button("🔄 Refresh", elem_classes="refresh-button")
|
||||
shared.gradio['image_prev_page'] = gr.Button("◀ Prev Page", elem_classes="refresh-button")
|
||||
shared.gradio['image_page_info'] = gr.Markdown(value=get_initial_page_info, elem_id="image-page-info")
|
||||
shared.gradio['image_next_page'] = gr.Button("Next Page ▶", elem_classes="refresh-button")
|
||||
shared.gradio['image_page_input'] = gr.Number(value=1, label="Page", precision=0, minimum=1, scale=0, min_width=80)
|
||||
shared.gradio['image_go_to_page'] = gr.Button("Go", elem_classes="refresh-button", scale=0, min_width=50)
|
||||
|
||||
# State for current page and selected image path
|
||||
shared.gradio['image_current_page'] = gr.State(value=0)
|
||||
shared.gradio['image_selected_path'] = gr.State(value="")
|
||||
|
||||
# Paginated gallery using gr.Gallery
|
||||
shared.gradio['image_history_gallery'] = gr.Gallery(
|
||||
value=lambda: get_paginated_images(0)[0],
|
||||
label="Image History",
|
||||
show_label=False,
|
||||
columns=6,
|
||||
object_fit="cover",
|
||||
height="auto",
|
||||
allow_preview=True,
|
||||
elem_id="image-history-gallery"
|
||||
)
|
||||
|
||||
with gr.Column(scale=1):
|
||||
gr.Markdown("### Generation Settings")
|
||||
shared.gradio['image_settings_display'] = gr.Markdown("Select an image to view its settings")
|
||||
shared.gradio['image_send_to_generate'] = gr.Button("Send to Generate", variant="primary")
|
||||
shared.gradio['image_gallery_status'] = gr.Markdown("")
|
||||
|
||||
gr.Markdown("### Import Image")
|
||||
shared.gradio['image_drop_upload'] = gr.Image(
|
||||
label="Drop image here to view settings",
|
||||
type="filepath",
|
||||
height=150
|
||||
)
|
||||
|
||||
# TAB 3: MODEL
|
||||
with gr.TabItem("Model"):
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
shared.gradio['image_model_menu'] = gr.Dropdown(
|
||||
choices=utils.get_available_image_models(),
|
||||
value=shared.settings['image_model_menu'],
|
||||
label='Model',
|
||||
elem_classes='slim-dropdown'
|
||||
)
|
||||
shared.gradio['image_refresh_models'] = gr.Button("🔄", elem_classes='refresh-button', scale=0, min_width=40)
|
||||
shared.gradio['image_load_model'] = gr.Button("Load", variant='primary', elem_classes='refresh-button')
|
||||
shared.gradio['image_unload_model'] = gr.Button("Unload", elem_classes='refresh-button')
|
||||
|
||||
gr.Markdown("## Settings")
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
shared.gradio['image_quant'] = gr.Dropdown(
|
||||
label='Quantization',
|
||||
choices=['none', 'bnb-8bit', 'bnb-4bit', 'torchao-int8wo', 'torchao-fp4', 'torchao-float8wo'],
|
||||
value=shared.settings['image_quant'],
|
||||
info='BnB: bitsandbytes quantization. torchao: int8wo, fp4, float8wo.'
|
||||
)
|
||||
|
||||
shared.gradio['image_dtype'] = gr.Dropdown(
|
||||
choices=['bfloat16', 'float16'],
|
||||
value=shared.settings['image_dtype'],
|
||||
label='Data Type',
|
||||
info='bfloat16 recommended for modern GPUs'
|
||||
)
|
||||
shared.gradio['image_attn_backend'] = gr.Dropdown(
|
||||
choices=['sdpa', 'flash_attention_2'],
|
||||
value=shared.settings['image_attn_backend'],
|
||||
label='Attention Backend',
|
||||
info='SDPA is default. Flash Attention requires compatible GPU.'
|
||||
)
|
||||
with gr.Column():
|
||||
shared.gradio['image_compile'] = gr.Checkbox(
|
||||
value=shared.settings['image_compile'],
|
||||
label='Compile Model',
|
||||
info='Faster inference after first run. First run will be slow.'
|
||||
)
|
||||
shared.gradio['image_cpu_offload'] = gr.Checkbox(
|
||||
value=shared.settings['image_cpu_offload'],
|
||||
label='CPU Offload',
|
||||
info='Enable for low VRAM GPUs. Slower but uses less memory.'
|
||||
)
|
||||
|
||||
with gr.Column():
|
||||
shared.gradio['image_download_path'] = gr.Textbox(
|
||||
label="Download model",
|
||||
placeholder="Tongyi-MAI/Z-Image-Turbo",
|
||||
info="Enter HuggingFace path. Use : for branch, e.g. user/model:main"
|
||||
)
|
||||
shared.gradio['image_download_btn'] = gr.Button("Download", variant='primary')
|
||||
shared.gradio['image_model_status'] = gr.Markdown(value="")
|
||||
|
||||
|
||||
def create_event_handlers():
|
||||
# Dimension controls
|
||||
shared.gradio['image_aspect_ratio'].change(
|
||||
apply_aspect_ratio,
|
||||
gradio('image_aspect_ratio', 'image_width', 'image_height'),
|
||||
gradio('image_width', 'image_height'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_width'].release(
|
||||
update_height_from_width,
|
||||
gradio('image_width', 'image_aspect_ratio'),
|
||||
gradio('image_height'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_height'].release(
|
||||
update_width_from_height,
|
||||
gradio('image_height', 'image_aspect_ratio'),
|
||||
gradio('image_width'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_swap_btn'].click(
|
||||
swap_dimensions_and_update_ratio,
|
||||
gradio('image_width', 'image_height', 'image_aspect_ratio'),
|
||||
gradio('image_width', 'image_height', 'image_aspect_ratio'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
# Generation
|
||||
shared.gradio['image_generate_btn'].click(
|
||||
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
|
||||
lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('image_stop_btn', 'image_generate_btn')).then(
|
||||
generate, gradio('interface_state'), gradio('image_output_gallery', 'image_progress'), show_progress=False).then(
|
||||
lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('image_stop_btn', 'image_generate_btn'))
|
||||
|
||||
shared.gradio['image_prompt'].submit(
|
||||
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
|
||||
lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('image_stop_btn', 'image_generate_btn')).then(
|
||||
generate, gradio('interface_state'), gradio('image_output_gallery', 'image_progress'), show_progress=False).then(
|
||||
lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('image_stop_btn', 'image_generate_btn'))
|
||||
|
||||
shared.gradio['image_neg_prompt'].submit(
|
||||
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
|
||||
lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('image_stop_btn', 'image_generate_btn')).then(
|
||||
generate, gradio('interface_state'), gradio('image_output_gallery', 'image_progress'), show_progress=False).then(
|
||||
lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('image_stop_btn', 'image_generate_btn'))
|
||||
|
||||
# Stop button
|
||||
shared.gradio['image_stop_btn'].click(
|
||||
stop_everything_event, None, None, show_progress=False
|
||||
)
|
||||
|
||||
# Model management
|
||||
shared.gradio['image_refresh_models'].click(
|
||||
lambda: gr.update(choices=utils.get_available_image_models()),
|
||||
None,
|
||||
gradio('image_model_menu'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_load_model'].click(
|
||||
load_image_model_wrapper,
|
||||
gradio('image_model_menu', 'image_dtype', 'image_attn_backend', 'image_cpu_offload', 'image_compile', 'image_quant'),
|
||||
gradio('image_model_status'),
|
||||
show_progress=True
|
||||
)
|
||||
|
||||
shared.gradio['image_unload_model'].click(
|
||||
unload_image_model_wrapper,
|
||||
None,
|
||||
gradio('image_model_status'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_download_btn'].click(
|
||||
download_image_model_wrapper,
|
||||
gradio('image_download_path'),
|
||||
gradio('image_model_status', 'image_model_menu'),
|
||||
show_progress=True
|
||||
)
|
||||
|
||||
# Gallery pagination handlers
|
||||
shared.gradio['image_refresh_history'].click(
|
||||
refresh_gallery,
|
||||
gradio('image_current_page'),
|
||||
gradio('image_history_gallery', 'image_current_page', 'image_page_info'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_next_page'].click(
|
||||
next_page,
|
||||
gradio('image_current_page'),
|
||||
gradio('image_history_gallery', 'image_current_page', 'image_page_info'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_prev_page'].click(
|
||||
prev_page,
|
||||
gradio('image_current_page'),
|
||||
gradio('image_history_gallery', 'image_current_page', 'image_page_info'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_go_to_page'].click(
|
||||
go_to_page,
|
||||
gradio('image_page_input', 'image_current_page'),
|
||||
gradio('image_history_gallery', 'image_current_page', 'image_page_info'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
# Image selection from gallery
|
||||
shared.gradio['image_history_gallery'].select(
|
||||
on_gallery_select,
|
||||
gradio('image_current_page'),
|
||||
gradio('image_selected_path', 'image_settings_display'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
# Send to Generate
|
||||
shared.gradio['image_send_to_generate'].click(
|
||||
send_to_generate,
|
||||
gradio('image_selected_path'),
|
||||
gradio(
|
||||
'image_prompt',
|
||||
'image_neg_prompt',
|
||||
'image_width',
|
||||
'image_height',
|
||||
'image_aspect_ratio',
|
||||
'image_steps',
|
||||
'image_seed',
|
||||
'image_cfg_scale',
|
||||
'image_gallery_status'
|
||||
),
|
||||
js=f'() => {{{ui.switch_tabs_js}; switch_to_image_ai_generate()}}',
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
shared.gradio['image_drop_upload'].change(
|
||||
read_dropped_image_metadata,
|
||||
gradio('image_drop_upload'),
|
||||
gradio('image_settings_display'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
# LLM Variations visibility toggle
|
||||
shared.gradio['image_llm_variations'].change(
|
||||
lambda x: gr.update(visible=x),
|
||||
gradio('image_llm_variations'),
|
||||
gradio('image_llm_variations_prompt'),
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
|
||||
def generate_prompt_variation(state):
|
||||
"""Generate a creative variation of the image prompt using the LLM."""
|
||||
from modules.chat import generate_chat_prompt
|
||||
from modules.text_generation import generate_reply
|
||||
|
||||
prompt = state['image_prompt']
|
||||
|
||||
# Check if LLM is loaded
|
||||
model_loaded, _ = check_model_loaded()
|
||||
if not model_loaded:
|
||||
logger.warning("No LLM loaded for prompt variation. Using original prompt.")
|
||||
return prompt
|
||||
|
||||
# Get the custom variation prompt or use default
|
||||
variation_instruction = state.get('image_llm_variations_prompt', '')
|
||||
if not variation_instruction:
|
||||
variation_instruction = 'Write a variation of the image generation prompt above. Consider the intent of the user with that prompt and write something that will likely please them, with added details. Output only the new prompt. Do not add any explanations, prefixes, or additional text.'
|
||||
|
||||
augmented_message = f"{prompt}\n\n=====\n\n{variation_instruction}"
|
||||
|
||||
# Use minimal state for generation
|
||||
var_state = state.copy()
|
||||
var_state['history'] = {'internal': [], 'visible': [], 'metadata': {}}
|
||||
var_state['auto_max_new_tokens'] = True
|
||||
var_state['enable_thinking'] = False
|
||||
var_state['reasoning_effort'] = 'low'
|
||||
var_state['start_with'] = ""
|
||||
|
||||
formatted_prompt = generate_chat_prompt(augmented_message, var_state)
|
||||
|
||||
variation = ""
|
||||
for reply in generate_reply(formatted_prompt, var_state, stopping_strings=[], is_chat=True):
|
||||
variation = reply
|
||||
|
||||
# Strip thinking blocks if present
|
||||
if "</think>" in variation:
|
||||
variation = variation.rsplit("</think>", 1)[1]
|
||||
elif "<|start|>assistant<|channel|>final<|message|>" in variation:
|
||||
variation = variation.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
|
||||
elif "</seed:think>" in variation:
|
||||
variation = variation.rsplit("</seed:think>", 1)[1]
|
||||
|
||||
variation = variation.strip()
|
||||
if len(variation) >= 2 and variation.startswith('"') and variation.endswith('"'):
|
||||
variation = variation[1:-1]
|
||||
|
||||
if variation:
|
||||
logger.info("Prompt variation:")
|
||||
print(variation)
|
||||
return variation
|
||||
|
||||
return prompt
|
||||
|
||||
|
||||
def progress_bar_html(progress=0, text=""):
|
||||
"""Generate HTML for progress bar. Empty div when progress <= 0."""
|
||||
if progress <= 0:
|
||||
return '<div class="image-ai-separator"></div>'
|
||||
|
||||
return f'''<div class="image-ai-progress-wrapper">
|
||||
<div class="image-ai-progress-track">
|
||||
<div class="image-ai-progress-fill" style="width: {progress * 100:.1f}%;"></div>
|
||||
</div>
|
||||
<div class="image-ai-progress-text">{text}</div>
|
||||
</div>'''
|
||||
|
||||
|
||||
def generate(state, save_images=True):
|
||||
"""
|
||||
Generate images using the loaded model.
|
||||
Automatically adjusts parameters based on pipeline type.
|
||||
"""
|
||||
import queue
|
||||
import threading
|
||||
|
||||
import torch
|
||||
|
||||
from modules.torch_utils import clear_torch_cache, get_device
|
||||
|
||||
try:
|
||||
model_name = state['image_model_menu']
|
||||
|
||||
if not model_name or model_name == 'None':
|
||||
logger.error("No image model selected. Go to the Model tab and select a model.")
|
||||
yield [], progress_bar_html()
|
||||
return
|
||||
|
||||
if shared.image_model is None:
|
||||
result = load_image_model(
|
||||
model_name,
|
||||
dtype=state['image_dtype'],
|
||||
attn_backend=state['image_attn_backend'],
|
||||
cpu_offload=state['image_cpu_offload'],
|
||||
compile_model=state['image_compile'],
|
||||
quant_method=state['image_quant']
|
||||
)
|
||||
if result is None:
|
||||
logger.error(f"Failed to load model `{model_name}`.")
|
||||
yield [], progress_bar_html()
|
||||
return
|
||||
|
||||
shared.image_model_name = model_name
|
||||
|
||||
seed = state['image_seed']
|
||||
if seed == -1:
|
||||
seed = random.randint(0, 2**32 - 1)
|
||||
|
||||
device = get_device()
|
||||
if device is None:
|
||||
device = "cpu"
|
||||
generator = torch.Generator(device)
|
||||
|
||||
all_images = []
|
||||
|
||||
# Get pipeline type for parameter adjustment
|
||||
pipeline_type = getattr(shared, 'image_pipeline_type', None)
|
||||
if pipeline_type is None:
|
||||
pipeline_type = get_pipeline_type(shared.image_model)
|
||||
|
||||
prompt = state['image_prompt']
|
||||
|
||||
shared.stop_everything = False
|
||||
|
||||
batch_count = int(state['image_batch_count'])
|
||||
steps_per_batch = int(state['image_steps'])
|
||||
total_steps = steps_per_batch * batch_count
|
||||
|
||||
# Queue for progress updates from callback
|
||||
progress_queue = queue.Queue()
|
||||
|
||||
def interrupt_callback(pipe, step_index, timestep, callback_kwargs):
|
||||
if shared.stop_everything:
|
||||
pipe._interrupt = True
|
||||
progress_queue.put(step_index + 1)
|
||||
return callback_kwargs
|
||||
|
||||
gen_kwargs = {
|
||||
"prompt": prompt,
|
||||
"negative_prompt": state['image_neg_prompt'],
|
||||
"height": int(state['image_height']),
|
||||
"width": int(state['image_width']),
|
||||
"num_inference_steps": steps_per_batch,
|
||||
"num_images_per_prompt": int(state['image_batch_size']),
|
||||
"generator": generator,
|
||||
"callback_on_step_end": interrupt_callback,
|
||||
}
|
||||
|
||||
cfg_val = state.get('image_cfg_scale', 0.0)
|
||||
if pipeline_type == 'qwenimage':
|
||||
gen_kwargs["true_cfg_scale"] = cfg_val
|
||||
else:
|
||||
gen_kwargs["guidance_scale"] = cfg_val
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
for batch_idx in range(batch_count):
|
||||
if shared.stop_everything:
|
||||
break
|
||||
|
||||
generator.manual_seed(int(seed + batch_idx))
|
||||
|
||||
# Generate prompt variation if enabled
|
||||
if state['image_llm_variations']:
|
||||
gen_kwargs["prompt"] = generate_prompt_variation(state)
|
||||
|
||||
# Run generation in thread so we can yield progress
|
||||
result_holder = []
|
||||
error_holder = []
|
||||
|
||||
def run_batch():
|
||||
try:
|
||||
# Apply magic suffix only at generation time for qwenimage
|
||||
clean_prompt = gen_kwargs["prompt"]
|
||||
if pipeline_type == 'qwenimage':
|
||||
magic_suffix = ", Ultra HD, 4K, cinematic composition"
|
||||
if magic_suffix.strip(", ") not in clean_prompt:
|
||||
gen_kwargs["prompt"] = clean_prompt + magic_suffix
|
||||
|
||||
result_holder.extend(shared.image_model(**gen_kwargs).images)
|
||||
gen_kwargs["prompt"] = clean_prompt # restore
|
||||
except Exception as e:
|
||||
error_holder.append(e)
|
||||
|
||||
thread = threading.Thread(target=run_batch)
|
||||
thread.start()
|
||||
|
||||
# Yield progress updates while generation runs
|
||||
while thread.is_alive():
|
||||
try:
|
||||
step = progress_queue.get(timeout=0.1)
|
||||
absolute_step = batch_idx * steps_per_batch + step
|
||||
pct = absolute_step / total_steps
|
||||
text = f"Batch {batch_idx + 1}/{batch_count} — Step {step}/{steps_per_batch}"
|
||||
yield all_images, progress_bar_html(pct, text)
|
||||
except queue.Empty:
|
||||
pass
|
||||
|
||||
thread.join()
|
||||
|
||||
if error_holder:
|
||||
raise error_holder[0]
|
||||
|
||||
# Save this batch's images with the actual prompt and seed used
|
||||
if save_images:
|
||||
batch_seed = seed + batch_idx
|
||||
original_prompt = state['image_prompt']
|
||||
state['image_prompt'] = gen_kwargs["prompt"]
|
||||
saved_paths = save_generated_images(result_holder, state, batch_seed)
|
||||
state['image_prompt'] = original_prompt
|
||||
# Use file paths so gallery serves actual PNGs with metadata
|
||||
all_images.extend(saved_paths)
|
||||
else:
|
||||
# Fallback to PIL objects if not saving
|
||||
all_images.extend(result_holder)
|
||||
|
||||
yield all_images, progress_bar_html((batch_idx + 1) / batch_count, f"Batch {batch_idx + 1}/{batch_count} complete")
|
||||
|
||||
t1 = time.time()
|
||||
|
||||
total_images = batch_count * int(state['image_batch_size'])
|
||||
logger.info(f'Generated {total_images} {"image" if total_images == 1 else "images"} in {(t1 - t0):.2f} seconds ({total_steps / (t1 - t0):.2f} steps/s, seed {seed})')
|
||||
|
||||
yield all_images, progress_bar_html()
|
||||
clear_torch_cache()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Image generation failed: {e}")
|
||||
traceback.print_exc()
|
||||
yield [], progress_bar_html()
|
||||
clear_torch_cache()
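generate() above runs the diffusers pipeline in a worker thread and drains a queue of step indices so the surrounding generator can keep yielding progress to Gradio. A stripped-down sketch of just that coordination pattern, with a sleep loop standing in for the real pipeline:

```python
# Only the thread/queue coordination mirrors the code above; the workload is fake.
import queue
import threading
import time


def run_with_progress(total_steps=5):
    progress_queue = queue.Queue()
    result_holder, error_holder = [], []

    def worker():
        try:
            for step in range(total_steps):
                time.sleep(0.1)               # placeholder for one denoising step
                progress_queue.put(step + 1)  # plays the role of callback_on_step_end
            result_holder.append("images")
        except Exception as e:
            error_holder.append(e)

    thread = threading.Thread(target=worker)
    thread.start()

    while thread.is_alive():
        try:
            step = progress_queue.get(timeout=0.1)
            yield f"step {step}/{total_steps}"
        except queue.Empty:
            pass

    thread.join()
    if error_holder:
        raise error_holder[0]
    yield "done"


for update in run_with_progress():
    print(update)
```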
|
||||
|
||||
|
||||
def load_image_model_wrapper(model_name, dtype, attn_backend, cpu_offload, compile_model, quant_method):
|
||||
if not model_name or model_name == 'None':
|
||||
yield "No model selected"
|
||||
return
|
||||
|
||||
try:
|
||||
yield f"Loading `{model_name}`..."
|
||||
unload_image_model()
|
||||
|
||||
result = load_image_model(
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
attn_backend=attn_backend,
|
||||
cpu_offload=cpu_offload,
|
||||
compile_model=compile_model,
|
||||
quant_method=quant_method
|
||||
)
|
||||
|
||||
if result is not None:
|
||||
shared.image_model_name = model_name
|
||||
yield f"✓ Loaded **{model_name}** (quantization: {quant_method})"
|
||||
else:
|
||||
yield f"✗ Failed to load `{model_name}`"
|
||||
except Exception:
|
||||
yield f"Error:\n```\n{traceback.format_exc()}\n```"
|
||||
|
||||
|
||||
def unload_image_model_wrapper():
|
||||
previous_name = shared.image_model_name
|
||||
unload_image_model()
|
||||
if previous_name != 'None':
|
||||
return f"Model: **{previous_name}** (unloaded)"
|
||||
return "No model loaded"
|
||||
|
||||
|
||||
def download_image_model_wrapper(model_path):
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
if not model_path:
|
||||
yield "No model specified", gr.update()
|
||||
return
|
||||
|
||||
try:
|
||||
model_path = model_path.strip()
|
||||
if model_path.startswith('https://huggingface.co/'):
|
||||
model_path = model_path[len('https://huggingface.co/'):]
|
||||
elif model_path.startswith('huggingface.co/'):
|
||||
model_path = model_path[len('huggingface.co/'):]
|
||||
|
||||
if ':' in model_path:
|
||||
model_id, branch = model_path.rsplit(':', 1)
|
||||
else:
|
||||
model_id, branch = model_path, 'main'
|
||||
|
||||
folder_name = model_id.replace('/', '_')
|
||||
output_folder = Path(shared.args.image_model_dir) / folder_name
|
||||
|
||||
yield f"Downloading `{model_id}` (branch: {branch})...", gr.update()
|
||||
|
||||
snapshot_download(
|
||||
repo_id=model_id,
|
||||
revision=branch,
|
||||
local_dir=output_folder,
|
||||
local_dir_use_symlinks=False,
|
||||
)
|
||||
|
||||
new_choices = utils.get_available_image_models()
|
||||
yield f"✓ Downloaded to `{output_folder}`", gr.update(choices=new_choices, value=folder_name)
|
||||
except Exception:
|
||||
yield f"Error:\n```\n{traceback.format_exc()}\n```", gr.update()
|
||||
|
|
@ -50,7 +50,6 @@ def create_ui():

    with gr.Column():
        shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
        shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
        shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
        shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
        shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)

@ -84,7 +83,6 @@ def create_ui():
        shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
        shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
        shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
        shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
        shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
        shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
        shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)

@ -96,7 +94,7 @@ def create_ui():
        shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')

    with gr.Column():
        shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
        shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
        shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
        shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
        shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@ -86,7 +86,7 @@ def check_model_loaded():
    return True, None


def resolve_model_path(model_name_or_path, image_model=False):
def resolve_model_path(model_name_or_path):
    """
    Resolves a model path, checking for a direct path
    before the default models directory.

@ -95,8 +95,6 @@ def resolve_model_path(model_name_or_path, image_model=False):
    path_candidate = Path(model_name_or_path)
    if path_candidate.exists():
        return path_candidate
    elif image_model:
        return Path(f'{shared.args.image_model_dir}/{model_name_or_path}')
    else:
        return Path(f'{shared.args.model_dir}/{model_name_or_path}')

@ -155,24 +153,6 @@ def get_available_models():
    return filtered_gguf_files + model_dirs


def get_available_image_models():
    model_dir = Path(shared.args.image_model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)

    # Find valid model directories
    model_dirs = []
    for item in os.listdir(model_dir):
        item_path = model_dir / item
        if not item_path.is_dir():
            continue

        model_dirs.append(item)

    model_dirs = sorted(model_dirs, key=natural_keys)

    return model_dirs


def get_available_ggufs():
    model_list = []
    model_dir = Path(shared.args.model_dir)
@ -3,17 +3,15 @@ audioop-lts<1.0; python_version >= "3.13"
bitsandbytes==0.48.*
colorama
datasets
diffusers==0.36.*
einops
fastapi==0.112.4
flash-linear-attention==0.4.0
flash-linear-attention==0.3.2
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.18.*
peft==0.17.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
@ -22,13 +20,12 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
safetensors==0.7.*
|
||||
safetensors==0.6.*
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
torchao==0.14.*
|
||||
transformers==4.57.*
|
||||
triton-windows==3.5.1.post22; platform_system == "Windows"
|
||||
triton-windows==3.5.0.post21; platform_system == "Windows"
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
|
@ -42,10 +39,10 @@ sse-starlette==1.6.5
|
|||
tiktoken
|
||||
|
||||
# CUDA wheels
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.18/exllamav3-0.0.18+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.18/exllamav3-0.0.18+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
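The `; platform_system == ...` suffixes on the wheel URLs above are PEP 508 environment markers, which pip evaluates against the running interpreter before deciding whether to install a line. A small sketch of evaluating one locally (assumes the `packaging` library is installed):

```python
# Evaluate a requirements environment marker against the current interpreter.
from packaging.markers import Marker

marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
print(marker.evaluate())  # True only on 64-bit Linux running Python 3.11
```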
|
||||
|
|
|
|||
|
|
@ -2,16 +2,14 @@ accelerate==1.8.*
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
colorama
|
||||
datasets
|
||||
diffusers==0.36.*
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==0.36.0
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==2.2.*
|
||||
pandas
|
||||
peft==0.18.*
|
||||
peft==0.17.*
|
||||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
|
|
@ -20,13 +18,12 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
safetensors==0.7.*
|
||||
safetensors==0.6.*
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
torchao==0.14.*
|
||||
transformers==4.57.*
|
||||
triton-windows==3.5.1.post22; platform_system == "Windows"
|
||||
triton-windows==3.5.0.post21; platform_system == "Windows"
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
|
@ -40,7 +37,7 @@ sse-starlette==1.6.5
|
|||
tiktoken
|
||||
|
||||
# AMD wheels
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||
|
|
|
|||
|
|
@ -2,16 +2,14 @@ accelerate==1.8.*
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
colorama
|
||||
datasets
|
||||
diffusers==0.36.*
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==0.36.0
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==2.2.*
|
||||
pandas
|
||||
peft==0.18.*
|
||||
peft==0.17.*
|
||||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
|
|
@ -20,13 +18,12 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
safetensors==0.7.*
|
||||
safetensors==0.6.*
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
torchao==0.14.*
|
||||
transformers==4.57.*
|
||||
triton-windows==3.5.1.post22; platform_system == "Windows"
|
||||
triton-windows==3.5.0.post21; platform_system == "Windows"
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
|
@ -40,7 +37,7 @@ sse-starlette==1.6.5
|
|||
tiktoken
|
||||
|
||||
# AMD wheels
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||
|
|
|
|||
|
|
@ -2,16 +2,14 @@ accelerate==1.8.*
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
colorama
|
||||
datasets
|
||||
diffusers==0.36.*
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==0.36.0
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==2.2.*
|
||||
pandas
|
||||
peft==0.18.*
|
||||
peft==0.17.*
|
||||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
|
|
@ -20,13 +18,12 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
safetensors==0.7.*
|
||||
safetensors==0.6.*
|
||||
scipy
|
||||
sentencepiece
|
||||
tensorboard
|
||||
torchao==0.14.*
|
||||
transformers==4.57.*
|
||||
triton-windows==3.5.1.post22; platform_system == "Windows"
|
||||
triton-windows==3.5.0.post21; platform_system == "Windows"
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
|
|
@ -40,5 +37,5 @@ sse-starlette==1.6.5
|
|||
tiktoken
|
||||
|
||||
# Mac wheels
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and python_version == "3.11"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
|
||||
|
|
@@ -2,16 +2,14 @@ accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
diffusers==0.36.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.18.*
peft==0.17.*
Pillow>=9.5.0
psutil
pydantic==2.11.0

@@ -20,13 +18,12 @@ python-docx==1.1.2
pyyaml
requests
rich
safetensors==0.7.*
safetensors==0.6.*
scipy
sentencepiece
tensorboard
torchao==0.14.*
transformers==4.57.*
triton-windows==3.5.1.post22; platform_system == "Windows"
triton-windows==3.5.0.post21; platform_system == "Windows"
tqdm
wandb

@@ -40,5 +37,6 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
@@ -2,16 +2,14 @@ accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
diffusers==0.36.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.18.*
peft==0.17.*
Pillow>=9.5.0
psutil
pydantic==2.11.0

@@ -20,13 +18,12 @@ python-docx==1.1.2
pyyaml
requests
rich
safetensors==0.7.*
safetensors==0.6.*
scipy
sentencepiece
tensorboard
torchao==0.14.*
transformers==4.57.*
triton-windows==3.5.1.post22; platform_system == "Windows"
triton-windows==3.5.0.post21; platform_system == "Windows"
tqdm
wandb

@@ -40,5 +37,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
@@ -2,16 +2,14 @@ accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
diffusers==0.36.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.18.*
peft==0.17.*
Pillow>=9.5.0
psutil
pydantic==2.11.0

@@ -20,13 +18,12 @@ python-docx==1.1.2
pyyaml
requests
rich
safetensors==0.7.*
safetensors==0.6.*
scipy
sentencepiece
tensorboard
torchao==0.14.*
transformers==4.57.*
triton-windows==3.5.1.post22; platform_system == "Windows"
triton-windows==3.5.0.post21; platform_system == "Windows"
tqdm
wandb

@@ -40,5 +37,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
@@ -3,17 +3,15 @@ audioop-lts<1.0; python_version >= "3.13"
bitsandbytes==0.48.*
colorama
datasets
diffusers==0.36.*
einops
fastapi==0.112.4
flash-linear-attention==0.4.0
flash-linear-attention==0.3.2
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.18.*
peft==0.17.*
Pillow>=9.5.0
psutil
pydantic==2.11.0

@@ -22,13 +20,12 @@ python-docx==1.1.2
pyyaml
requests
rich
safetensors==0.7.*
safetensors==0.6.*
scipy
sentencepiece
tensorboard
torchao==0.14.*
transformers==4.57.*
triton-windows==3.5.1.post22; platform_system == "Windows"
triton-windows==3.5.0.post21; platform_system == "Windows"
tqdm
wandb

@@ -42,10 +39,10 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.18/exllamav3-0.0.18+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.18/exllamav3-0.0.18+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.11/exllamav3-0.0.11+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
@@ -2,16 +2,14 @@ accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
diffusers==0.36.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pandas
peft==0.18.*
peft==0.17.*
Pillow>=9.5.0
psutil
pydantic==2.11.0

@@ -20,13 +18,12 @@ python-docx==1.1.2
pyyaml
requests
rich
safetensors==0.7.*
safetensors==0.6.*
scipy
sentencepiece
tensorboard
torchao==0.14.*
transformers==4.57.*
triton-windows==3.5.1.post22; platform_system == "Windows"
triton-windows==3.5.0.post21; platform_system == "Windows"
tqdm
wandb
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

@@ -23,5 +22,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -1,27 +0,0 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich
tqdm

# Gradio
gradio==4.37.*
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl

# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -1,27 +0,0 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich
tqdm

# Gradio
gradio==4.37.*
https://github.com/oobabooga/gradio/releases/download/custom-build/gradio_client-1.0.2+custom.1-py3-none-any.whl

# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

@@ -23,5 +22,6 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

@@ -23,5 +22,6 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

@@ -23,5 +22,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

@@ -23,5 +22,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

@@ -23,5 +22,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

@@ -22,6 +21,6 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# Vulkan wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -1,7 +1,6 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==0.36.0
jinja2==3.1.6
markdown
numpy==2.2.*

@@ -23,5 +22,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.71.0/llama_cpp_binaries-0.71.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.56.0/llama_cpp_binaries-0.56.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
31 server.py
@@ -5,7 +5,6 @@ from pathlib import Path
from modules import shared
from modules.block_requests import OpenMonkeyPatch, RequestBlocker
from modules.image_models import load_image_model
from modules.logging_colors import logger
from modules.prompts import load_prompt

@@ -51,7 +50,6 @@ from modules import (
    ui_chat,
    ui_default,
    ui_file_saving,
    ui_image_generation,
    ui_model_menu,
    ui_notebook,
    ui_parameters,

@@ -101,11 +99,6 @@ def create_interface():
    auth.extend(x.strip() for line in file for x in line.split(',') if x.strip())
    auth = [tuple(cred.split(':')) for cred in auth]

    # Allowed paths
    allowed_paths = ["css", "js", "extensions", "user_data/cache"]
    if not shared.args.multi_user:
        allowed_paths.append("user_data/image_outputs")

    # Import the extensions and execute their setup() functions
    if shared.args.extensions is not None and len(shared.args.extensions) > 0:
        extensions_module.load_extensions()

@@ -170,7 +163,6 @@ def create_interface():
    ui_chat.create_character_settings_ui()  # Character tab
    ui_model_menu.create_ui()  # Model tab
    if not shared.args.portable:
        ui_image_generation.create_ui()  # Image generation tab
        training.create_ui()  # Training tab
    ui_session.create_ui()  # Session tab

@@ -178,8 +170,6 @@ def create_interface():
    ui_chat.create_event_handlers()
    ui_default.create_event_handlers()
    ui_notebook.create_event_handlers()
    if not shared.args.portable:
        ui_image_generation.create_event_handlers()

    # Other events
    ui_file_saving.create_event_handlers()

@@ -242,7 +232,7 @@ def create_interface():
        ssl_keyfile=shared.args.ssl_keyfile,
        ssl_certfile=shared.args.ssl_certfile,
        root_path=shared.args.subpath,
        allowed_paths=allowed_paths,
        allowed_paths=["css", "js", "extensions", "user_data/cache"]
    )

@@ -266,9 +256,6 @@ if __name__ == "__main__":
    if new_settings:
        shared.settings.update(new_settings)

    # Apply CLI overrides for image model settings (CLI flags take precedence over saved settings)
    shared.apply_image_model_cli_overrides()

    # Fallback settings for models
    shared.model_config['.*'] = get_fallback_settings()
    shared.model_config.move_to_end('.*', last=False)  # Move to the beginning

@@ -280,22 +267,6 @@ if __name__ == "__main__":
    if extension not in shared.args.extensions:
        shared.args.extensions.append(extension)

    # Load image model if specified via CLI
    if shared.args.image_model:
        logger.info(f"Loading image model: {shared.args.image_model}")
        result = load_image_model(
            shared.args.image_model,
            dtype=shared.settings.get('image_dtype', 'bfloat16'),
            attn_backend=shared.settings.get('image_attn_backend', 'sdpa'),
            cpu_offload=shared.settings.get('image_cpu_offload', False),
            compile_model=shared.settings.get('image_compile', False),
            quant_method=shared.settings.get('image_quant', 'none')
        )
        if result is not None:
            shared.image_model_name = shared.args.image_model
        else:
            logger.error(f"Failed to load image model: {shared.args.image_model}")

    available_models = utils.get_available_models()

    # Model defined through --model
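The @@ -242,7 +232,7 @@ hunk above swaps a hard-coded allowed_paths list in the Gradio launch() call for a variable assembled earlier in create_interface() (the @@ -101,11 +99,6 @@ hunk). A minimal sketch of that pattern, assuming Gradio 4.x (the pinned gradio==4.37.* accepts allowed_paths in launch()); the multi_user flag here is a stand-in for shared.args.multi_user:

import gradio as gr

multi_user = False  # stand-in for shared.args.multi_user

# Directories Gradio is allowed to serve as static files.
allowed_paths = ["css", "js", "extensions", "user_data/cache"]
if not multi_user:
    # Per-user image outputs are only exposed in single-user mode.
    allowed_paths.append("user_data/image_outputs")

with gr.Blocks() as demo:
    gr.Markdown("placeholder UI")

demo.launch(allowed_paths=allowed_paths)

Building the list before launch() keeps the multi-user policy in one place instead of duplicating it inside the launch call.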