Merge pull request #7452 from oobabooga/dev

Merge dev branch
This commit is contained in:
oobabooga 2026-04-02 22:18:46 -03:00 committed by GitHub
commit ae699ac570
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
50 changed files with 1160 additions and 651 deletions

View file

@ -68,3 +68,31 @@ jobs:
with:
version: ${{ inputs.version }}
config: 'os:macos-15-intel,macos-14'
# Portable-build jobs for the ik_llama.cpp backend. Each job entry belongs
# under the top-level `jobs:` key and invokes a reusable workflow
# (workflow_call), narrowing the build matrix to a single OS via `config`.
  build_release_ik_cuda_windows:
    name: ik CUDA Windows
    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:windows-2022'

  build_release_ik_cuda_linux:
    name: ik CUDA Linux
    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:ubuntu-22.04'

  build_release_ik_cpu_windows:
    name: ik CPU Windows
    uses: ./.github/workflows/build-portable-release-ik.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:windows-2022'

  build_release_ik_cpu_linux:
    name: ik CPU Linux
    uses: ./.github/workflows/build-portable-release-ik.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:ubuntu-22.04'

View file

@ -0,0 +1,178 @@
# Builds portable text-generation-webui packages with the ik_llama.cpp CUDA
# backend for Windows and Linux, then uploads the archives to the GitHub
# release matching the given version tag.
name: Build ik CUDA

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string
  # Mirror of the dispatch inputs so this workflow can be called from the
  # release orchestrator via `uses:`.
  workflow_call:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string

# Needed by the upload step to attach assets to the release.
permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    env:
      CONFIGIN: ${{ inputs.config }}
      EXCLUDEIN: ${{ inputs.exclude }}
    steps:
      - name: Define Job Output
        id: set-matrix
        # Builds the JSON build matrix. `config` ('key:item,item;...') replaces
        # whole matrix axes; `exclude` ('key:item,...;...') becomes matrix
        # exclusion entries.
        run: |
          $matrix = @{
            'os' = @('ubuntu-22.04', 'windows-2022')
            'pyver' = @("3.13")
            'cuda' = @("12.4", "13.1")
          }

          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

          if ($env:EXCLUDEIN -ne 'None') {
            $exclusions = @()
            $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
            $matrix['exclude'] = $exclusions
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} ${{ matrix.pyver }} CUDA ${{ matrix.cuda }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      PCKGVER: ${{ inputs.version }}
    steps:
      - uses: actions/checkout@v6
        with:
          repository: 'oobabooga/text-generation-webui'
          ref: ${{ inputs.version }}
          submodules: 'recursive'

      - uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.pyver }}

      - name: Build Package
        shell: bash
        run: |
          VERSION_CLEAN="${{ inputs.version }}"
          VERSION_CLEAN="${VERSION_CLEAN#v}"
          cd ..
          cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
          cd "text-generation-webui-${VERSION_CLEAN}"

          # Remove extensions that need additional requirements
          allowed=("character_bias" "gallery" "sd_api_pictures")
          find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf

          # Define common variables
          CUDA_VERSION="${{ matrix.cuda }}"
          VERSION="${{ inputs.version }}"

          # 1. Set platform-specific variables
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            PLATFORM="windows"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
            PIP_PATH="portable_env/python.exe -m pip"
            PACKAGES_PATH="portable_env/Lib/site-packages"
            rm start_linux.sh start_macos.sh
          else
            PLATFORM="linux"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
            PIP_PATH="portable_env/bin/python -m pip"
            PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
            rm start_macos.sh start_windows.bat
          fi

          # 2. Download and extract Python
          cd ..
          echo "Downloading Python for $PLATFORM..."
          curl -L -o python-build.tar.gz "$PYTHON_URL"
          tar -xzf python-build.tar.gz
          mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"

          # 3. Prepare requirements file based on CUDA version
          cd "text-generation-webui-${VERSION_CLEAN}"
          if [[ "$CUDA_VERSION" == "13.1" ]]; then
            REQ_FILE="requirements/portable/requirements_ik_cuda131.txt"
          else
            REQ_FILE="requirements/portable/requirements_ik.txt"
          fi

          # 4. Inject --ik into start scripts
          sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true

          # 5. Install packages
          echo "Installing Python packages from $REQ_FILE..."
          $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"

          # 6. Clean up
          rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py

          # 7. Create archive
          cd ..
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
            echo "Creating archive: $ARCHIVE_NAME"
            powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
          else
            ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz"
            echo "Creating archive: $ARCHIVE_NAME"
            tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
          fi

      - name: Upload files to a GitHub release
        id: upload-release
        uses: svenstaro/upload-release-action@2.7.0
        # Best effort: a failed upload should not fail the whole matrix job.
        continue-on-error: true
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: ../textgen-portable-ik-*
          tag: ${{ inputs.version }}
          file_glob: true
          make_latest: false
          overwrite: true

View file

@ -0,0 +1,173 @@
# Builds CPU-only portable text-generation-webui packages with the
# ik_llama.cpp backend for Windows and Linux, then uploads the archives to
# the GitHub release matching the given version tag.
name: Build ik CPU

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string
  # Mirror of the dispatch inputs so this workflow can be called from the
  # release orchestrator via `uses:`.
  workflow_call:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string

# Needed by the upload step to attach assets to the release.
permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    env:
      CONFIGIN: ${{ inputs.config }}
      EXCLUDEIN: ${{ inputs.exclude }}
    steps:
      - name: Define Job Output
        id: set-matrix
        # Builds the JSON build matrix (no CUDA axis for CPU-only builds).
        # `config` replaces whole axes; `exclude` becomes exclusion entries.
        run: |
          $matrix = @{
            'os' = @('ubuntu-22.04', 'windows-2022')
            'pyver' = @("3.13")
          }

          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

          if ($env:EXCLUDEIN -ne 'None') {
            $exclusions = @()
            $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
            $matrix['exclude'] = $exclusions
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} ${{ matrix.pyver }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      PCKGVER: ${{ inputs.version }}
    steps:
      - uses: actions/checkout@v6
        with:
          repository: 'oobabooga/text-generation-webui'
          ref: ${{ inputs.version }}
          submodules: 'recursive'

      - uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.pyver }}

      - name: Build Package
        shell: bash
        run: |
          VERSION_CLEAN="${{ inputs.version }}"
          VERSION_CLEAN="${VERSION_CLEAN#v}"
          cd ..
          cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
          cd "text-generation-webui-${VERSION_CLEAN}"

          # Remove extensions that need additional requirements
          allowed=("character_bias" "gallery" "sd_api_pictures")
          find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf

          # Define common variables
          VERSION="${{ inputs.version }}"

          # 1. Set platform-specific variables
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            PLATFORM="windows-cpu"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
            PIP_PATH="portable_env/python.exe -m pip"
            PACKAGES_PATH="portable_env/Lib/site-packages"
            rm start_linux.sh start_macos.sh
          else
            PLATFORM="linux-cpu"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
            PIP_PATH="portable_env/bin/python -m pip"
            PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
            rm start_macos.sh start_windows.bat
          fi

          # 2. Download and extract Python
          echo "Downloading Python for $PLATFORM..."
          cd ..
          curl -L -o python-build.tar.gz "$PYTHON_URL"
          tar -xzf python-build.tar.gz
          mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"

          # 3. Prepare requirements file
          cd "text-generation-webui-${VERSION_CLEAN}"
          REQ_FILE="requirements/portable/requirements_ik_cpu_only.txt"
          echo "Using requirements file: $REQ_FILE"

          # 4. Inject --ik into start scripts
          sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true

          # 5. Install packages
          echo "Installing Python packages from $REQ_FILE..."
          $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"

          # 6. Clean up
          rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py

          # 7. Create archive
          cd ..
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip"
            echo "Creating archive: $ARCHIVE_NAME"
            powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
          else
            ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz"
            echo "Creating archive: $ARCHIVE_NAME"
            tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
          fi

      - name: Upload files to a GitHub release
        id: upload-release
        uses: svenstaro/upload-release-action@2.7.0
        # Best effort: a failed upload should not fail the whole matrix job.
        continue-on-error: true
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: ../textgen-portable-ik-*
          tag: ${{ inputs.version }}
          file_glob: true
          make_latest: false
          overwrite: true

View file

@ -112,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin
The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template.
Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format.
Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any) from the model metadata (e.g. `tokenizer_config.json` or GGUF metadata), and will update the values under "Parameters" > "Instruction template" accordingly. You should check the model card on Hugging Face to see if you are using the correct prompt format.
### Chat-instruct

View file

@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \
#### Chat completions
Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`.
Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be detected automatically from the model metadata.
```shell
curl http://127.0.0.1:5000/v1/chat/completions \

View file

@ -158,28 +158,21 @@ class ModelDownloader:
# Also if GGUF and safetensors are available, download only safetensors
if (has_pytorch or has_pt or has_gguf) and has_safetensors:
has_gguf = False
for i in range(len(classifications) - 1, -1, -1):
if classifications[i] in ['pytorch', 'pt', 'gguf']:
links.pop(i)
file_sizes.pop(i)
keep = [i for i, c in enumerate(classifications) if c not in ['pytorch', 'pt', 'gguf']]
links = [links[i] for i in keep]
file_sizes = [file_sizes[i] for i in keep]
# For GGUF, try to download only the Q4_K_M if no specific file is specified.
if has_gguf and specific_file is None:
has_q4km = False
for i in range(len(classifications) - 1, -1, -1):
if 'q4_k_m' in links[i].lower():
has_q4km = True
has_q4km = any('q4_k_m' in link.lower() for link in links)
if has_q4km:
for i in range(len(classifications) - 1, -1, -1):
if 'q4_k_m' not in links[i].lower():
links.pop(i)
file_sizes.pop(i)
keep = [i for i, link in enumerate(links) if 'q4_k_m' in link.lower()]
else:
for i in range(len(classifications) - 1, -1, -1):
if links[i].lower().endswith('.gguf'):
links.pop(i)
file_sizes.pop(i)
keep = [i for i, link in enumerate(links) if not link.lower().endswith('.gguf')]
links = [links[i] for i in keep]
file_sizes = [file_sizes[i] for i in keep]
is_llamacpp = has_gguf and specific_file is not None
return links, sha256, is_lora, is_llamacpp, file_sizes

View file

@ -2,8 +2,11 @@ import concurrent.futures
import requests
from modules.web_search import _validate_url
def download_single(url):
_validate_url(url)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

View file

@ -5,12 +5,14 @@ import requests
from bs4 import BeautifulSoup
import extensions.superboogav2.parameters as parameters
from modules.web_search import _validate_url
from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
_validate_url(url)
response = requests.get(url, timeout=5)
if response.status_code == 200:
return response.content

View file

@ -1,6 +1,6 @@
function toggleDarkMode() {
document.body.classList.toggle("dark");
var currentCSS = document.getElementById("highlight-css");
const currentCSS = document.getElementById("highlight-css");
if (currentCSS.getAttribute("href") === "file/css/highlightjs/github-dark.min.css") {
currentCSS.setAttribute("href", "file/css/highlightjs/github.min.css");
} else {
@ -9,12 +9,10 @@ function toggleDarkMode() {
// Re-highlight all code blocks once stylesheet loads
currentCSS.onload = function() {
const messageBodies = document.getElementById("chat").querySelectorAll(".message-body");
messageBodies.forEach((messageBody) => {
const codeBlocks = messageBody.querySelectorAll("pre code");
codeBlocks.forEach((codeBlock) => {
hljs.highlightElement(codeBlock);
});
// Clear data-highlighted so hljs will re-process with the new theme
document.querySelectorAll("#chat .message-body pre code[data-highlighted]").forEach((codeBlock) => {
delete codeBlock.dataset.highlighted;
});
doSyntaxHighlighting();
};
}

View file

@ -1,11 +1,35 @@
// -------------------------------------------------
// Shared helpers
// -------------------------------------------------
function getProfilePictureUrl() {
return "/file/user_data/cache/pfp_character.png?time=" + Date.now();
}
const MESSAGE_SELECTOR = ".message, .user-message, .assistant-message";
function getMessageElement(element) {
if (!element) return null;
return element.closest(MESSAGE_SELECTOR);
}
function isUserRole(messageElement) {
return messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") !== null ||
messageElement.querySelector(".circle-you") !== null;
}
// Trigger a synthetic 'input' event so Gradio picks up programmatic value changes
function dispatchGradioInput(element) {
element.dispatchEvent(new Event("input", { bubbles: true }));
}
// -------------------------------------------------
// Event handlers
// -------------------------------------------------
function copyToClipboard(element) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
const messageElement = getMessageElement(element);
if (!messageElement) return;
const rawText = messageElement.getAttribute("data-raw");
@ -48,9 +72,7 @@ function fallbackCopyToClipboard(text) {
}
function branchHere(element) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
const messageElement = getMessageElement(element);
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
@ -69,11 +91,7 @@ function branchHere(element) {
}
branchIndexInput.value = index;
// Trigger any 'change' or 'input' events Gradio might be listening for
const event = new Event("input", { bubbles: true });
branchIndexInput.dispatchEvent(event);
dispatchGradioInput(branchIndexInput);
branchButton.click();
}
@ -82,9 +100,7 @@ function branchHere(element) {
// -------------------------------------------------
function editHere(buttonElement) {
if (!buttonElement) return;
const messageElement = buttonElement.closest(".message, .user-message, .assistant-message");
const messageElement = getMessageElement(buttonElement);
if (!messageElement) return;
const messageBody = messageElement.querySelector(".message-body");
@ -97,12 +113,7 @@ function editHere(buttonElement) {
return;
}
// Determine role based on message element - handle different chat modes
const isUserMessage = messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") !== null ||
messageElement.querySelector(".circle-you") !== null;
startEditing(messageElement, messageBody, isUserMessage);
startEditing(messageElement, messageBody, isUserRole(messageElement));
}
function startEditing(messageElement, messageBody, isUserMessage) {
@ -209,30 +220,22 @@ function submitMessageEdit(index, newText, isUserMessage) {
editTextInput.value = newText;
editRoleInput.value = isUserMessage ? "user" : "assistant";
editIndexInput.dispatchEvent(new Event("input", { bubbles: true }));
editTextInput.dispatchEvent(new Event("input", { bubbles: true }));
editRoleInput.dispatchEvent(new Event("input", { bubbles: true }));
dispatchGradioInput(editIndexInput);
dispatchGradioInput(editTextInput);
dispatchGradioInput(editRoleInput);
editButton.click();
return true;
}
function navigateVersion(element, direction) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
const messageElement = getMessageElement(element);
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
if (!index) return;
// Determine role based on message element classes
let role = "assistant"; // Default role
if (messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") ||
messageElement.querySelector(".circle-you")) {
role = "user";
}
const role = isUserRole(messageElement) ? "user" : "assistant";
const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input");
const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea");
@ -248,11 +251,9 @@ function navigateVersion(element, direction) {
directionInput.value = direction;
roleInput.value = role;
// Trigger 'input' events for Gradio to pick up changes
const event = new Event("input", { bubbles: true });
indexInput.dispatchEvent(event);
directionInput.dispatchEvent(event);
roleInput.dispatchEvent(event);
dispatchGradioInput(indexInput);
dispatchGradioInput(directionInput);
dispatchGradioInput(roleInput);
navigateButton.click();
}
@ -313,7 +314,7 @@ function handleMorphdomUpdate(data) {
function applyMorphdomUpdate(data) {
// Determine target element and use it as query scope
var target_element, target_html;
let target_element, target_html;
if (data.last_message_only) {
const childNodes = document.getElementsByClassName("messages")[0].childNodes;
target_element = childNodes[childNodes.length - 1];

View file

@ -4,8 +4,9 @@
// Sync highlight.js theme with the actual Gradio theme
var defined_hljs_css = document.body.classList.contains("dark") ? "file/css/highlightjs/github-dark.min.css" : "file/css/highlightjs/github.min.css";
if (document.getElementById("highlight-css").getAttribute("href") !== defined_hljs_css) {
document.getElementById("highlight-css").setAttribute("href", defined_hljs_css);
var hljsCssElement = document.getElementById("highlight-css");
if (hljsCssElement.getAttribute("href") !== defined_hljs_css) {
hljsCssElement.setAttribute("href", defined_hljs_css);
}
let main_parent = document.getElementById("chat-tab").parentNode;
@ -49,21 +50,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event)
//------------------------------------------------
// --- Helper functions --- //
function isModifiedKeyboardEvent() {
return (event instanceof KeyboardEvent &&
event.shiftKey ||
event.ctrlKey ||
event.altKey ||
event.metaKey);
function isModifiedKeyboardEvent(event) {
return event instanceof KeyboardEvent &&
(event.shiftKey || event.ctrlKey || event.altKey || event.metaKey);
}
function isFocusedOnEditableTextbox() {
function isFocusedOnEditableTextbox(event) {
if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") {
return !!event.target.value;
}
return false;
}
let previousTabId = "chat-tab-button";
document.addEventListener("keydown", function(event) {
// Stop generation on Esc pressed
if (event.key === "Escape") {
@ -117,14 +115,14 @@ document.addEventListener("keydown", function(event) {
}
// --- Simple version navigation --- //
if (!isFocusedOnEditableTextbox()) {
if (!isFocusedOnEditableTextbox(event)) {
// Version navigation on Arrow keys (horizontal)
if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") {
if (!isModifiedKeyboardEvent(event) && event.key === "ArrowLeft") {
event.preventDefault();
navigateLastAssistantMessage("left");
}
else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") {
else if (!isModifiedKeyboardEvent(event) && event.key === "ArrowRight") {
event.preventDefault();
if (!navigateLastAssistantMessage("right")) {
// If can't navigate right (last version), regenerate
@ -159,9 +157,8 @@ targetElement.addEventListener("scroll", function() {
let diff = targetElement.scrollHeight - targetElement.clientHeight;
let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0;
// Add scrolling class to disable hover effects
if (window.isScrolled || !isAtBottomNow) {
targetElement.classList.add("scrolling");
targetElement.classList.add("scrolling"); // Disables hover effects during scroll
}
if(isAtBottomNow) {
@ -202,12 +199,8 @@ const observer = new MutationObserver(function() {
});
// Only watch for attribute changes on targetElement (e.g. _generating class)
const config = {
attributes: true
};
// Start observing the target element
observer.observe(targetElement, config);
observer.observe(targetElement, { attributes: true });
//------------------------------------------------
// Handle syntax highlighting / LaTeX
@ -228,7 +221,7 @@ window.doSyntaxHighlighting = function() {
if (messageBodies.length > 0) {
let hasSeenVisible = false;
// Go from last message to first
// Go from last message to first so we can early-exit once past visible area
for (let i = messageBodies.length - 1; i >= 0; i--) {
const messageBody = messageBodies[i];
@ -243,8 +236,8 @@ window.doSyntaxHighlighting = function() {
codeBlock.classList.add("pretty_scrollbar");
});
// Only render math in visible elements
const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
// Only render math in individually visible containers (the outer check is on the message body)
mathContainers.forEach(container => {
if (isElementVisibleOnScreen(container)) {
renderMathInElement(container, {
@ -271,7 +264,7 @@ const doSyntaxHighlighting = window.doSyntaxHighlighting;
// Add some scrollbars
//------------------------------------------------
const scrollbarElements = document.querySelectorAll(".add_scrollbar textarea, .add_scrollbar .drag-drop-list");
for(i = 0; i < scrollbarElements.length; i++) {
for(let i = 0; i < scrollbarElements.length; i++) {
scrollbarElements[i].classList.remove("scroll-hide");
scrollbarElements[i].classList.add("pretty_scrollbar");
scrollbarElements[i].style.resize = "none";
@ -298,13 +291,13 @@ if (toolsInfo) {
// Remove some backgrounds
//------------------------------------------------
const noBackgroundelements = document.querySelectorAll(".no-background");
for(i = 0; i < noBackgroundelements.length; i++) {
for(let i = 0; i < noBackgroundelements.length; i++) {
noBackgroundelements[i].parentNode.style.border = "none";
noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center";
}
const slimDropdownElements = document.querySelectorAll(".slim-dropdown");
for (i = 0; i < slimDropdownElements.length; i++) {
for (let i = 0; i < slimDropdownElements.length; i++) {
const parentNode = slimDropdownElements[i].parentNode;
parentNode.style.background = "transparent";
parentNode.style.border = "0";
@ -374,49 +367,43 @@ button.addEventListener("click", function () {
}
});
// Add event listener for mouseleave on the button
button.addEventListener("mouseleave", function () {
// Delay to prevent menu hiding when the mouse leaves the button into the menu
// Delay to prevent menu hiding when the mouse leaves the button or menu
function delayedHideMenu() {
setTimeout(function () {
if (!isMouseOverButtonOrMenu()) {
hideMenu();
}
}, 100);
});
}
// Add event listener for mouseleave on the button
button.addEventListener("mouseleave", delayedHideMenu);
// Add event listener for mouseleave on the menu
menu.addEventListener("mouseleave", function () {
// Delay to prevent menu hide when the mouse leaves the menu into the button
setTimeout(function () {
if (!isMouseOverButtonOrMenu()) {
hideMenu();
}
}, 100);
});
menu.addEventListener("mouseleave", delayedHideMenu);
// Add event listener for click anywhere in the document
document.addEventListener("click", function (event) {
const target = event.target;
// Check if the click is outside the button/menu and the menu is visible
if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") {
hideMenu();
}
if (event.target.classList.contains("pfp_character")) {
const target = event.target;
if (target.classList.contains("pfp_character")) {
toggleBigPicture();
}
// Handle sidebar clicks on mobile
if (isMobile()) {
// Check if the click did NOT originate from any of the specified toggle buttons or elements
// Check if the click did NOT originate from any of the specified toggle buttons or elements
if (
target.closest("#navigation-toggle") !== navigationToggle &&
target.closest("#past-chats-toggle") !== pastChatsToggle &&
target.closest("#chat-controls-toggle") !== chatControlsToggle &&
target.closest(".header_bar") !== headerBar &&
target.closest("#past-chats-row") !== pastChatsRow &&
target.closest("#chat-controls") !== chatControlsRow
target.closest("#past-chats-toggle") !== pastChatsToggle &&
target.closest("#chat-controls-toggle") !== chatControlsToggle &&
target.closest(".header_bar") !== headerBar &&
target.closest("#past-chats-row") !== pastChatsRow &&
target.closest("#chat-controls") !== chatControlsRow
) {
handleIndividualSidebarClose(event);
}
@ -433,27 +420,19 @@ document.getElementById("chat-input-row").classList.add("chat-input-positioned")
//------------------------------------------------
const chatTextArea = document.getElementById("chat-input").querySelector("textarea");
function respondToChatInputVisibility(element, callback) {
var options = {
root: document.documentElement,
};
var observer = new IntersectionObserver((entries, observer) => {
function focusOnVisible(element) {
var observer = new IntersectionObserver((entries) => {
entries.forEach(entry => {
callback(entry.intersectionRatio > 0);
if (entry.intersectionRatio > 0) {
element.focus();
}
});
}, options);
}, { root: document.documentElement });
observer.observe(element);
}
function handleChatInputVisibilityChange(isVisible) {
if (isVisible) {
chatTextArea.focus();
}
}
respondToChatInputVisibility(chatTextArea, handleChatInputVisibilityChange);
focusOnVisible(chatTextArea);
//------------------------------------------------
// Show enlarged character picture when the profile
@ -463,8 +442,7 @@ let bigPictureVisible = false;
function addBigPicture() {
var imgElement = document.createElement("img");
var timestamp = new Date().getTime();
imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
imgElement.src = getProfilePictureUrl();
imgElement.classList.add("bigProfilePicture");
imgElement.addEventListener("load", function () {
this.style.visibility = "visible";
@ -478,9 +456,8 @@ function addBigPicture() {
}
function deleteBigPicture() {
var bigProfilePictures = document.querySelectorAll(".bigProfilePicture");
bigProfilePictures.forEach(function (element) {
element.parentNode.removeChild(element);
document.querySelectorAll(".bigProfilePicture").forEach(function (element) {
element.remove();
});
}
@ -494,44 +471,11 @@ function toggleBigPicture() {
}
}
//------------------------------------------------
// Handle the chat input box growth
//------------------------------------------------
// Cache DOM elements
const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
const chatInput = document.querySelector("#chat-input textarea");
// Variables to store current dimensions
let currentChatInputHeight = chatInput.clientHeight;
//------------------------------------------------
// Focus on the rename text area when it becomes visible
//------------------------------------------------
const renameTextArea = document.getElementById("rename-row").querySelector("textarea");
function respondToRenameVisibility(element, callback) {
var options = {
root: document.documentElement,
};
var observer = new IntersectionObserver((entries, observer) => {
entries.forEach(entry => {
callback(entry.intersectionRatio > 0);
});
}, options);
observer.observe(element);
}
function handleVisibilityChange(isVisible) {
if (isVisible) {
renameTextArea.focus();
}
}
respondToRenameVisibility(renameTextArea, handleVisibilityChange);
focusOnVisible(renameTextArea);
//------------------------------------------------
// Adjust the chat tab margin if no extension UI
@ -737,21 +681,21 @@ function handleIndividualSidebarClose(event) {
// Close navigation bar if click is outside and it is open
if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) {
toggleSidebar(headerBar, navigationToggle, true);
toggleSidebar(headerBar, navigationToggle);
}
// Close past chats row if click is outside and it is open
if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(pastChatsRow, pastChatsToggle, true);
toggleSidebar(pastChatsRow, pastChatsToggle);
}
// Close chat controls row if click is outside and it is open
if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(chatControlsRow, chatControlsToggle, true);
toggleSidebar(chatControlsRow, chatControlsToggle);
}
}
function toggleSidebar(sidebar, toggle, forceClose = false) {
function toggleSidebar(sidebar, toggle) {
const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden");
const shouldClose = !isCurrentlyHidden;
@ -776,11 +720,6 @@ function toggleSidebar(sidebar, toggle, forceClose = false) {
toggle.classList.toggle("chat-controls-open", !shouldClose);
toggle.innerHTML = shouldClose ? leftArrowSVG : rightArrowSVG;
}
// Mobile handling
if (isMobile()) {
sidebar.classList.toggle("sidebar-shown", !shouldClose);
}
}
// Function to check if the device is mobile
@ -840,17 +779,17 @@ pastChatsToggle.addEventListener("click", () => {
const isCurrentlyOpen = !pastChatsRow.classList.contains("sidebar-hidden");
toggleSidebar(pastChatsRow, pastChatsToggle);
// On desktop, open/close both sidebars at the same time
// On desktop, sync both sidebars together
if (!isMobile()) {
if (isCurrentlyOpen) {
// If we just closed the left sidebar, also close the right sidebar
if (!chatControlsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(chatControlsRow, chatControlsToggle, true);
toggleSidebar(chatControlsRow, chatControlsToggle);
}
} else {
// If we just opened the left sidebar, also open the right sidebar
if (chatControlsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(chatControlsRow, chatControlsToggle, false);
toggleSidebar(chatControlsRow, chatControlsToggle);
}
}
}
@ -860,17 +799,17 @@ chatControlsToggle.addEventListener("click", () => {
const isCurrentlyOpen = !chatControlsRow.classList.contains("sidebar-hidden");
toggleSidebar(chatControlsRow, chatControlsToggle);
// On desktop, open/close both sidebars at the same time
// On desktop, sync both sidebars together
if (!isMobile()) {
if (isCurrentlyOpen) {
// If we just closed the right sidebar, also close the left sidebar
if (!pastChatsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(pastChatsRow, pastChatsToggle, true);
toggleSidebar(pastChatsRow, pastChatsToggle);
}
} else {
// If we just opened the right sidebar, also open the left sidebar
if (pastChatsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(pastChatsRow, pastChatsToggle, false);
toggleSidebar(pastChatsRow, pastChatsToggle);
}
}
}
@ -890,7 +829,7 @@ if (isMobile()) {
const textarea = document.querySelector("#chat-input textarea");
if (textarea) {
// Simulate adding and removing a newline
// Force textarea height recalculation by simulating content change
textarea.value += "\n";
textarea.dispatchEvent(new Event("input", { bubbles: true }));
textarea.value = textarea.value.slice(0, -1);

View file

@ -1,10 +1,9 @@
// Functions for downloading JSON files
function getCurrentTimestamp() {
const now = new Date();
const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds
const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert minutes to milliseconds
const localTime = new Date(now.getTime() - timezoneOffset);
const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
return formattedTimestamp;
return localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
}
function saveFile(contents, filename) {
@ -18,23 +17,18 @@ function saveFile(contents, filename) {
}
function saveHistory(history, character, mode) {
let path = null;
let path;
if (["chat", "chat-instruct"].includes(mode) && character && character.trim() !== "") {
path = `history_${character}_${getCurrentTimestamp()}.json`;
} else {
try {
path = `history_${mode}_${getCurrentTimestamp()}.json`;
} catch (error) {
path = `history_${getCurrentTimestamp()}.json`;
}
path = `history_${mode || "unknown"}_${getCurrentTimestamp()}.json`;
}
saveFile(history, path);
}
function saveSession(session) {
let path = null;
path = `session_${getCurrentTimestamp()}.json`;
const path = `session_${getCurrentTimestamp()}.json`;
saveFile(session, path);
}

View file

@ -1,13 +1,11 @@
const chatParent = document.querySelector(".chat-parent");
function toggle_controls(value) {
const navToggle = document.getElementById("navigation-toggle");
const pastChatsToggle = document.getElementById("past-chats-toggle");
const extensions = document.querySelector("#extensions");
const galleryExtension = document.getElementById("gallery-extension");
if (value) {
// SHOW MODE: Click toggles to show hidden sidebars
const navToggle = document.getElementById("navigation-toggle");
const pastChatsToggle = document.getElementById("past-chats-toggle");
if (navToggle && document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
navToggle.click();
}
@ -19,17 +17,11 @@ function toggle_controls(value) {
if (extensions) {
extensions.style.display = "inherit";
}
let gallery_element = document.getElementById("gallery-extension");
if (gallery_element) {
gallery_element.style.display = "block";
if (galleryExtension) {
galleryExtension.style.display = "block";
}
} else {
// HIDE MODE: Click toggles to hide visible sidebars
const navToggle = document.getElementById("navigation-toggle");
const pastChatsToggle = document.getElementById("past-chats-toggle");
if (navToggle && !document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
navToggle.click();
}
@ -41,5 +33,8 @@ function toggle_controls(value) {
if (extensions) {
extensions.style.display = "none";
}
if (galleryExtension) {
galleryExtension.style.display = "none";
}
}
}

View file

@ -2,17 +2,9 @@ function scrollToTop() {
window.scrollTo({ top: 0 });
}
function findButtonsByText(buttonText) {
const buttons = document.getElementsByTagName("button");
const matchingButtons = [];
for (let i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === buttonText) {
matchingButtons.push(buttons[i]);
}
}
return matchingButtons;
function findButtonsByText(buttonText, container = document) {
return Array.from(container.getElementsByTagName("button"))
.filter(btn => btn.textContent.trim() === buttonText);
}
function switch_to_chat() {
@ -39,13 +31,9 @@ function switch_to_character() {
function switch_to_image_ai_generate() {
const container = document.querySelector("#image-ai-tab");
const buttons = container.getElementsByTagName("button");
for (let i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === "Generate") {
buttons[i].click();
break;
}
const generateBtn = findButtonsByText("Generate", container)[0];
if (generateBtn) {
generateBtn.click();
}
scrollToTop();

View file

@ -1,7 +1,6 @@
function updateBigPicture() {
var existingElement = document.querySelector(".bigProfilePicture");
if (existingElement) {
var timestamp = new Date().getTime();
existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
existingElement.src = getProfilePictureUrl();
}
}

View file

@ -39,6 +39,146 @@ def load_chat_template_file(filepath):
return text
def _first_token_display_str(token_id, prompt, tokenizer):
"""Return the display string for the first prompt token.
Returns empty string for BOS or tokens that don't appear at the start
of the prompt text, so they don't shift text_offset for subsequent tokens.
"""
token_id = int(token_id)
bos_id = getattr(tokenizer, 'bos_token_id', None)
if bos_id is not None and token_id == bos_id:
return ""
import torch
tok = tokenizer.decode(torch.tensor([token_id]))
if not prompt.startswith(tok):
return ""
return tok
def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
"""Compute logprob entries for prompt tokens via a forward pass.
Returns a list of logprob entries in the standard format.
The first token gets a null entry (no conditioning context).
Supported for HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
via a single forward pass, and for llama.cpp via the server's
prompt_logprobs parameter. Returns [] for unsupported loaders.
"""
if input_ids is None:
input_ids = encode(prompt) # (1, seq_len) tensor or array
token_ids = input_ids[0]
n_tokens = len(token_ids)
if n_tokens == 0:
return []
loader = shared.args.loader
model = shared.model
if loader == 'llama.cpp':
return model.get_prompt_logprob_entries(token_ids, max(logprobs_count, 1), prompt=prompt)
first_token_str = _first_token_display_str(token_ids[0], prompt, shared.tokenizer)
if n_tokens <= 1:
return [{"token": first_token_str, "null_logprob": True}]
import torch
from modules.torch_utils import clear_torch_cache
if hasattr(model, 'get_prompt_logits'):
logits = model.get_prompt_logits(input_ids)
elif hasattr(model, 'forward'):
# HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
if hasattr(model, 'device'):
input_ids_tensor = input_ids_tensor.to(model.device)
with torch.no_grad():
# Pass labels to ensure logits are returned for ALL positions,
# not just the last token (some HF wrappers like ExLlamav3_HF
# only compute the last-token logits when labels are absent).
outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor)
logits = outputs.logits # keep on GPU, (1, seq_len, vocab) in model dtype
del outputs
else:
return []
entries = [{"token": first_token_str, "null_logprob": True}]
logprobs_count = max(logprobs_count, 1)
k = min(logprobs_count, logits.shape[-1])
chunk_size = 2048
unique_ids = set(int(tid) for tid in token_ids[1:])
# Process logits in chunks on GPU, only move top-K results to CPU
all_top_log_probs_list = []
all_top_indices_list = []
all_actual_lps = []
for start in range(0, n_tokens - 1, chunk_size):
end = min(start + chunk_size, n_tokens - 1)
chunk_logits = logits[0, start:end].float() # (chunk, vocab) on GPU
chunk_lse = torch.logsumexp(chunk_logits, dim=-1)
chunk_top_values, chunk_top_indices = torch.topk(chunk_logits, k=k, dim=-1)
chunk_top_log_probs = chunk_top_values - chunk_lse.unsqueeze(-1)
# Compute logprob for actual next tokens in this chunk
chunk_top_sets = [set(chunk_top_indices[j].tolist()) for j in range(end - start)]
for j in range(end - start):
actual_tid = int(token_ids[start + j + 1])
if actual_tid not in chunk_top_sets[j]:
all_actual_lps.append((chunk_logits[j, actual_tid] - chunk_lse[j]).item())
else:
all_actual_lps.append(None) # will use top_log_probs
all_top_log_probs_list.append(chunk_top_log_probs.cpu())
all_top_indices_list.append(chunk_top_indices.cpu())
unique_ids.update(int(tid) for tid in chunk_top_indices.flatten().tolist())
del chunk_logits, chunk_lse, chunk_top_values
del logits
clear_torch_cache()
all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0)
all_top_indices = torch.cat(all_top_indices_list, dim=0)
unique_ids_list = sorted(unique_ids)
decoded_list = shared.tokenizer.batch_decode([[tid] for tid in unique_ids_list]) if hasattr(shared.tokenizer, 'batch_decode') else [shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids_list]
decoded_strs = dict(zip(unique_ids_list, decoded_list))
for i in range(1, n_tokens):
token_id = int(token_ids[i])
idx = i - 1
top_log_probs = all_top_log_probs[idx]
top_ids = all_top_indices[idx].tolist()
actual_token_str = decoded_strs[token_id]
if token_id in top_ids:
actual_lp = top_log_probs[top_ids.index(token_id)].item()
alternatives = [
{"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
for j in range(k) if top_ids[j] != token_id
]
else:
actual_lp = all_actual_lps[idx]
alternatives = [
{"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
for j in range(k - 1)
]
entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives}
entries.append(entry)
return entries
def _get_raw_logprob_entries(offset=0):
"""Get raw logprob entries from llama.cpp/ExLlamav3 backend, starting from offset.
@ -65,6 +205,21 @@ def _parse_entry_top(entry):
return entry.get('top_logprobs', entry.get('top_probs', []))
def _extract_sampled_token(entry, top):
"""Get the actually sampled token and its logprob from a logprob entry.
Uses the entry-level token/logprob when available (the actually sampled
token), falling back to top[0] (highest-probability alternative) which
may differ with non-greedy sampling.
"""
if 'token' in entry:
return entry['token'], entry.get('logprob', entry.get('prob', 0))
token_str = top[0].get('token', '')
token_logprob = top[0].get('logprob', top[0].get('prob', 0))
return token_str, token_logprob
def format_chat_logprobs(entries):
"""Format logprob entries into OpenAI chat completions logprobs format.
@ -79,9 +234,7 @@ def format_chat_logprobs(entries):
if not top:
continue
chosen = top[0]
token_str = chosen.get('token', '')
token_logprob = chosen.get('logprob', chosen.get('prob', 0))
token_str, token_logprob = _extract_sampled_token(entry, top)
top_list = []
for item in top:
@ -106,7 +259,7 @@ def format_chat_logprobs(entries):
def format_completion_logprobs(entries):
"""Format logprob entries into OpenAI completions logprobs format.
Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "text_offset"}
Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "top_logprobs_ids": [{token_id: prob}], "text_offset"}
"""
if not entries:
return None
@ -114,17 +267,27 @@ def format_completion_logprobs(entries):
tokens = []
token_logprobs = []
top_logprobs = []
top_logprobs_ids = []
text_offset = []
offset = 0
for entry in entries:
# Handle null logprob entries (first prompt token with echo)
if entry.get("null_logprob"):
token_str = entry.get("token", "")
tokens.append(token_str)
token_logprobs.append(None)
top_logprobs.append(None)
top_logprobs_ids.append(None)
text_offset.append(offset)
offset += len(token_str)
continue
top = _parse_entry_top(entry)
if not top:
continue
chosen = top[0]
token_str = chosen.get('token', '')
token_logprob = chosen.get('logprob', chosen.get('prob', 0))
token_str, token_logprob = _extract_sampled_token(entry, top)
tokens.append(token_str)
token_logprobs.append(token_logprob)
@ -132,21 +295,29 @@ def format_completion_logprobs(entries):
offset += len(token_str)
top_dict = {}
top_dict_ids = {}
for item in top:
t = item.get('token', '')
lp = item.get('logprob', item.get('prob', 0))
top_dict[t] = lp
tid = item.get('token_id', item.get('id'))
if tid is not None:
top_dict_ids[tid] = lp
top_logprobs.append(top_dict)
top_logprobs_ids.append(top_dict_ids if top_dict_ids else None)
if not tokens:
return None
return {
result = {
"tokens": tokens,
"token_logprobs": token_logprobs,
"top_logprobs": top_logprobs,
"text_offset": text_offset
}
if any(x is not None for x in top_logprobs_ids):
result["top_logprobs_ids"] = top_logprobs_ids
return result
def process_parameters(body, is_legacy=False):
@ -407,7 +578,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
})
max_tokens = generate_params['max_new_tokens']
if max_tokens in [None, 0]:
if max_tokens is not None and max_tokens <= 0:
raise InvalidRequestError(message="max_tokens must be greater than 0.", param="max_tokens")
if max_tokens is None:
generate_params['max_new_tokens'] = 512
generate_params['auto_max_new_tokens'] = True
@ -652,6 +826,15 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
# common params
generate_params = process_parameters(body, is_legacy=is_legacy)
max_tokens = generate_params['max_new_tokens']
if max_tokens is None:
generate_params['max_new_tokens'] = 512
generate_params['auto_max_new_tokens'] = True
max_tokens = 512
elif max_tokens < 0:
raise InvalidRequestError(message="max_tokens must be greater than or equal to 0.", param="max_tokens")
elif max_tokens == 0 and body.get('logprobs') is None:
raise InvalidRequestError(message="max_tokens is 0 but no logprobs parameter was specified.", param="max_tokens")
generate_params['stream'] = stream
if stop_event is not None:
generate_params['stop_event'] = stop_event
@ -700,9 +883,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
prompt = decode(prompt)[0]
prefix = prompt if echo else ''
token_count = len(encode(prompt)[0])
prompt_input_ids = encode(prompt)
token_count = len(prompt_input_ids[0])
total_prompt_token_count += token_count
# Compute prompt logprobs once per prompt (shared across n_completions)
logprobs_val = body.get('logprobs', None)
if echo and logprobs_val is not None and logprobs_val >= 0:
prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
else:
prompt_entries = None
original_seed = generate_params.get('seed', -1)
for _n in range(n_completions):
# Increment seed for each completion to ensure diversity (matches llama.cpp native behavior)
@ -713,29 +904,41 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
logprob_proc.token_alternatives_history.clear()
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
for a in generator:
answer = a
completion_token_count = len(encode(answer)[0])
total_completion_token_count += completion_token_count
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
stop_reason = "length"
if logprob_proc:
all_entries = []
for alt in logprob_proc.token_alternatives_history:
all_entries.extend(_dict_to_logprob_entries(alt))
completion_logprobs = format_completion_logprobs(all_entries)
elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
raw = getattr(shared.model, 'last_completion_probabilities', None)
completion_logprobs = format_completion_logprobs(raw)
if max_tokens == 0:
answer = ''
completion_token_count = 0
stop_reason = "stop"
else:
completion_logprobs = None
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
for a in generator:
answer = a
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
stop_reason = "length"
total_completion_token_count += completion_token_count
if max_tokens == 0:
all_entries = []
else:
if logprob_proc:
all_entries = []
for alt in logprob_proc.token_alternatives_history:
all_entries.extend(_dict_to_logprob_entries(alt))
elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
all_entries = getattr(shared.model, 'last_completion_probabilities', None) or []
else:
all_entries = []
if prompt_entries:
all_entries = prompt_entries + all_entries
completion_logprobs = format_completion_logprobs(all_entries) if all_entries else None
respi = {
"index": choice_index,
@ -775,7 +978,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
raise InvalidRequestError(message="API Batched generation not yet supported.", param=prompt_str)
prefix = prompt if echo else ''
token_count = len(encode(prompt)[0])
prompt_input_ids = encode(prompt)
token_count = len(prompt_input_ids[0])
# Check if usage should be included in streaming chunks per OpenAI spec
stream_options = body.get('stream_options')
@ -808,37 +1012,57 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
return chunk
logprobs_val = body.get('logprobs', None)
if echo and logprobs_val is not None and logprobs_val >= 0:
prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
prompt_logprobs_formatted = format_completion_logprobs(prompt_entries) if prompt_entries else None
else:
prompt_logprobs_formatted = None
# Clear stale logprobs from any previous request before building the
# first chunk, so text_streaming_chunk doesn't pick up old data.
if hasattr(shared.model, 'last_completion_probabilities'):
shared.model.last_completion_probabilities = []
cmpl_logprobs_offset[0] = 0
chunk = text_streaming_chunk(prefix)
if prompt_logprobs_formatted is not None:
chunk[resp_list][0]["logprobs"] = prompt_logprobs_formatted
if include_usage:
chunk['usage'] = None
yield chunk
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
seen_content = ''
completion_token_count = 0
if max_tokens == 0:
answer = ''
completion_token_count = 0
stop_reason = "stop"
else:
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
seen_content = ''
completion_token_count = 0
for a in generator:
answer = a
for a in generator:
answer = a
len_seen = len(seen_content)
new_content = answer[len_seen:]
len_seen = len(seen_content)
new_content = answer[len_seen:]
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
continue
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
continue
seen_content = answer
chunk = text_streaming_chunk(new_content)
if include_usage:
chunk['usage'] = None
yield chunk
seen_content = answer
chunk = text_streaming_chunk(new_content)
if include_usage:
chunk['usage'] = None
yield chunk
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
stop_reason = "length"
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
stop_reason = "length"
chunk = text_streaming_chunk(suffix)
chunk[resp_list][0]["finish_reason"] = stop_reason

View file

@ -68,7 +68,7 @@ def _load_model(data):
if k in shared.settings:
shared.settings[k] = settings[k]
if k == 'truncation_length':
logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}")
logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}")
elif k == 'instruction_template':
logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}")

View file

@ -671,7 +671,10 @@ def get_stopping_strings(state):
# Handle GPT-OSS as a special case
if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
result.remove("<|end|>")
result.append("<|result|>")
if '<|result|>' in state['instruction_template_str']:
result.append("<|result|>")
elif '<|return|>' in state['instruction_template_str']:
result.append("<|return|>")
result = list(set(result))
if shared.args.verbose:

View file

@ -423,6 +423,15 @@ class Exllamav3Model:
if logit_bias:
filters.append(LogitBiasFilter(self.tokenizer, logit_bias))
# Suppress EOS tokens via logit bias so they are never sampled
if state['ban_eos_token']:
eos_bias = {}
for eos_id in self.config.eos_token_id_list:
if eos_id is not None:
eos_bias[str(eos_id)] = float('-inf')
if eos_bias:
filters.append(LogitBiasFilter(self.tokenizer, eos_bias))
# Logprobs support (OpenAI API)
logprobs = state.get('logprobs', 0) or 0
return_top_tokens = logprobs if logprobs > 0 else 0
@ -480,15 +489,35 @@ class Exllamav3Model:
return
id_to_piece = self.tokenizer.get_id_to_piece_list(True)
sampled_ids = result.get("token_ids") # (batch, seq_len) - actually sampled tokens
sampled_probs = result.get("token_probs") # (batch, seq_len) - their probabilities
def _piece(tid):
s = id_to_piece[tid] if tid < len(id_to_piece) else f"<{tid}>"
return s.replace('\u2581', ' ')
def _logprob(prob):
return math.log(prob) if prob > 0 else float("-inf")
# top_k_tokens shape: (batch, seq_len, k), top_k_probs same
for seq_idx in range(top_k_tokens.shape[1]):
entry = {"top_logprobs": []}
for k_idx in range(top_k_tokens.shape[2]):
token_id = top_k_tokens[0, seq_idx, k_idx].item()
prob = top_k_probs[0, seq_idx, k_idx].item()
token_str = id_to_piece[token_id] if token_id < len(id_to_piece) else f"<{token_id}>"
logprob = math.log(prob) if prob > 0 else float("-inf")
entry["top_logprobs"].append({"token": token_str, "logprob": logprob})
entry["top_logprobs"].append({"token": _piece(token_id), "logprob": _logprob(prob)})
# Record the actually sampled token at the entry level so
# format_completion_logprobs uses it instead of top_logprobs[0]
# (they differ with non-greedy sampling).
if sampled_ids is not None:
sid = sampled_ids[0, seq_idx].item()
entry["token"] = _piece(sid)
if sampled_probs is not None:
entry["logprob"] = _logprob(sampled_probs[0, seq_idx].item())
else:
entry["logprob"] = None
self.last_completion_probabilities.append(entry)
def generate(self, prompt, state):
@ -498,42 +527,31 @@ class Exllamav3Model:
return output
def get_prompt_logits(self, input_ids):
"""Return logits for all positions via a single no-cache forward pass.
Used by prompt logprobs computation. Returns (1, seq_len, vocab) on CPU in float32.
"""
import torch
input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
input_ids_tensor = input_ids_tensor.view(1, -1).cpu()
with torch.no_grad():
return self.model.forward(
input_ids=input_ids_tensor,
params={"attn_mode": "flash_attn_nc"}
).cpu().float()
def get_logits(self, token_ids, **kwargs):
"""
Process a batch of token_ids and return the logits for the last token.
This will reset and overwrite the model's cache.
Uses flash_attn_nc (no cache) for correct results with recurrent models.
"""
# Initialize a single params dictionary that will be updated in-place
params = {
"cache": self.cache,
"reconstruct": False,
"attn_mode": "flash_attn",
"batch_shape": (1, self.max_tokens),
"past_len": 0
}
params.update(kwargs)
# Process prefix tokens to fill the cache and generate recurrent state
if token_ids.shape[-1] > 1:
prefix_ids = token_ids[:, :-1]
# This forward call updates the 'params' dict with the recurrent state
self.model.forward(
input_ids=prefix_ids,
params=params
)
# Update past_len for the next call
params["past_len"] = prefix_ids.shape[-1]
# Process the last token, now using the state-filled 'params' dict
last_token_ids = token_ids[:, -1:]
logits = self.model.forward(
input_ids=last_token_ids,
params=params
input_ids=token_ids,
params={"attn_mode": "flash_attn_nc"}
)
return logits.float().cpu()
return logits[:, -1:, :].float().cpu()
def encode(self, string, **kwargs):
add_bos = kwargs.pop('add_bos', True)

View file

@ -26,6 +26,9 @@ except Exception:
class Exllamav3HF(PreTrainedModel, GenerationMixin):
def __init__(self, model_dir):
hf_config = PretrainedConfig.from_pretrained(model_dir)
# Ensure text_config is a proper object, not a dict (fixes qwen3_5_moe + transformers compat)
if isinstance(getattr(hf_config, 'text_config', None), dict):
hf_config.text_config = PretrainedConfig(**hf_config.text_config)
super().__init__(hf_config)
exl3_config = Config.from_directory(model_dir)
@ -199,30 +202,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
}
).to(input_ids.device).float()
else:
# Labels path: use cache for cross-chunk attention.
tokens_to_process = seq_tensor
all_logits = None
current_len = 0
for i in range(0, tokens_to_process.shape[0], max_chunk_size):
chunk = tokens_to_process[i:i + max_chunk_size]
chunk_logits = self.ex_model.forward(
input_ids=chunk.view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": current_len,
"batch_shape": (1, self.max_tokens),
}
).float()
current_len += chunk.shape[0]
if all_logits is None:
all_logits = chunk_logits
else:
all_logits = torch.cat([all_logits, chunk_logits], dim=1)
logits = all_logits
# Labels path: single pass without cache for correct logits
logits = self.ex_model.forward(
input_ids=seq_tensor.view(1, -1),
params={"attn_mode": "flash_attn_nc"}
).float().cpu()
if is_negative:
self.past_seq_negative = seq_tensor

View file

@ -191,21 +191,19 @@ def _apply_custom_generate_reply():
def _apply_custom_css():
all_css = ''
for extension, _ in iterator():
if hasattr(extension, 'custom_css'):
all_css += getattr(extension, 'custom_css')()
return all_css
return ''.join(
getattr(extension, 'custom_css')()
for extension, _ in iterator()
if hasattr(extension, 'custom_css')
)
def _apply_custom_js():
all_js = ''
for extension, _ in iterator():
if hasattr(extension, 'custom_js'):
all_js += getattr(extension, 'custom_js')()
return all_js
return ''.join(
getattr(extension, 'custom_js')()
for extension, _ in iterator()
if hasattr(extension, 'custom_js')
)
def create_extensions_block():

View file

@ -11,7 +11,6 @@ import time
from pathlib import Path
from typing import Any, List
import llama_cpp_binaries
import requests
from modules import shared
@ -311,8 +310,45 @@ class LlamaServer:
else:
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
def get_prompt_logprob_entries(self, token_ids, n_probs=5, prompt=""):
"""Get logprob entries for prompt tokens via a single n_predict=0 request.
Requires llama.cpp server with prompt_logprobs support.
Returns entries in the standard format for format_completion_logprobs().
"""
token_ids_list = token_ids.tolist() if hasattr(token_ids, 'tolist') else list(token_ids)
url = f"http://127.0.0.1:{self.port}/completion"
payload = {
"prompt": token_ids_list,
"n_predict": 0,
"n_probs": n_probs,
"prompt_logprobs": True,
"stream": False,
"cache_prompt": False,
}
response = self.session.post(url, json=payload)
result = response.json()
prompt_probs = result.get("prompt_probabilities", [])
if not prompt_probs:
return []
# Null first token (no conditioning context); use empty string for BOS
# or tokens that don't appear at the start of the prompt text.
first_token_str = self.decode([token_ids_list[0]])
if self.bos_token and first_token_str == self.bos_token:
first_token_str = ""
elif not prompt.startswith(first_token_str):
first_token_str = ""
entries = [{"token": first_token_str, "null_logprob": True}]
entries.extend(prompt_probs)
return entries
def _get_vocabulary_size(self):
"""Get and store the model's maximum context length."""
"""Get and store the model's vocabulary size."""
url = f"http://127.0.0.1:{self.port}/v1/models"
response = self.session.get(url).json()
@ -357,7 +393,16 @@ class LlamaServer:
"""Start the llama.cpp server and wait until it's ready."""
# Determine the server path
if self.server_path is None:
self.server_path = llama_cpp_binaries.get_binary_path()
if shared.args.ik:
try:
import ik_llama_cpp_binaries
except ImportError:
raise ImportError("--ik requires the ik_llama_cpp_binaries package. Install it with: pip install <ik_llama_cpp_binaries wheel URL>")
self.server_path = ik_llama_cpp_binaries.get_binary_path()
else:
import llama_cpp_binaries
self.server_path = llama_cpp_binaries.get_binary_path()
# Build the command
cmd = [
@ -470,6 +515,10 @@ class LlamaServer:
else:
cmd.append(f"--{flag_item}")
# Patch flags for ik_llama.cpp compatibility
if shared.args.ik:
cmd = _patch_cmd_for_ik(cmd)
env = os.environ.copy()
if os.name == 'posix':
current_path = env.get('LD_LIBRARY_PATH', '')
@ -607,3 +656,49 @@ def filter_stderr_with_progress(process_stderr):
process_stderr.close()
except Exception:
pass
def _patch_cmd_for_ik(cmd):
    """
    Rewrite upstream llama.cpp server flags into their ik_llama.cpp
    equivalents and return a new argument list.

    Mappings:
      --no-webui            -> --webui none
      --fit off             -> (removed)
      --fit on / --fit-ctx  -> --fit (bare flag)
      --fit-target          -> --fit-margin (value passes through)
      --cache-reuse <val>   -> (removed, unsupported)
      --swa-full            -> (removed, unsupported)

    The input list is never mutated; callers always receive a fresh list.
    """
    patched = []
    i = 0
    while i < len(cmd):
        arg = cmd[i]
        if arg == "--no-webui":
            # ik_llama.cpp expects "--webui none" instead of a bare flag.
            patched += ["--webui", "none"]
        elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"):
            val = cmd[i + 1]
            i += 1  # consume the on/off value
            if val == "on":
                patched.append("--fit")
            # "off" -> drop the flag entirely
        elif arg == "--fit-ctx":
            patched.append("--fit")
            i += 1  # skip the value; ik's --fit takes no argument
        elif arg == "--fit-target":
            # Renamed flag; its value is appended unchanged on the next pass.
            patched.append("--fit-margin")
        elif arg == "--cache-reuse":
            i += 1  # unsupported; skip the flag and its value
        elif arg == "--swa-full":
            pass  # unsupported bare flag; drop it
        else:
            patched.append(arg)
        i += 1

    # Add Hadamard KV cache rotation when using quantized cache types.
    # This significantly improves quantized cache quality (especially q4_0)
    # and is a no-op for MLA models like DeepSeek. Appended to the result
    # instead of the input so the caller's list is left untouched.
    if shared.args.cache_type in ("q8_0", "q4_0"):
        patched += ["-khad", "-vhad"]

    return patched

View file

@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({
'no_mmap',
'mlock',
'numa',
'ik',
'parallel',
'model_draft',
'draft_max',
@ -345,6 +346,7 @@ def list_model_elements():
'spec_ngram_size_m',
'spec_ngram_min_hits',
'mmproj',
'ik',
]

View file

@ -4,7 +4,6 @@ import numpy as np
from modules import models, shared
from modules.logging_colors import logger
from modules.models import load_model
from modules.text_generation import generate_reply
from modules.utils import check_model_loaded
@ -12,8 +11,7 @@ global_scores = None
def get_next_logits(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
shared.model, shared.tokenizer = load_model(shared.model_name)
models.load_model_if_idle_unloaded()
needs_lock = not args[2] # use_samplers
if needs_lock:

View file

@ -1,4 +1,5 @@
import sys
import threading
import time
import modules.shared as shared
@ -7,6 +8,15 @@ from modules.models_settings import get_model_metadata
from modules.utils import resolve_model_path
last_generation_time = time.time()
active_generation_count = 0
_generation_count_lock = threading.Lock()
def load_model_if_idle_unloaded():
    """Reload the previously selected model if the idle timeout unloaded it.

    Acts only when an idle timeout is configured, no model is currently
    loaded, and a prior model name is available; on reload, the idle
    timer is reset to the current time.
    """
    global last_generation_time

    if shared.args.idle_timeout <= 0:
        return
    if shared.model is not None:
        return
    if shared.model_name in (None, 'None'):
        return

    shared.model, shared.tokenizer = load_model(shared.model_name)
    last_generation_time = time.time()
def load_model(model_name, loader=None):
@ -66,8 +76,7 @@ def load_model(model_name, loader=None):
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
logger.info(f"LOADER: \"{loader}\"")
logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}")
return model, tokenizer
@ -159,7 +168,10 @@ def unload_model_if_idle():
while True:
shared.generation_lock.acquire()
try:
if time.time() - last_generation_time > shared.args.idle_timeout * 60:
with _generation_count_lock:
is_active = active_generation_count > 0
if not is_active and time.time() - last_generation_time > shared.args.idle_timeout * 60:
if shared.model is not None:
logger.info("Unloading the model for inactivity.")
unload_model(keep_model_name=True)

View file

@ -23,14 +23,9 @@ def get_fallback_settings():
def get_model_metadata(model):
model_path = resolve_model_path(model)
model_settings = {}
# Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
settings = shared.model_config
for pat in settings:
if re.match(pat.lower(), Path(model).name.lower()):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
# Fallback settings
model_settings = get_fallback_settings()
path = model_path / 'config.json'
if path.exists():

View file

@ -110,6 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. Requires the ik_llama_cpp_binaries package to be installed.')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
@ -454,17 +455,7 @@ def load_user_config():
args.loader = fix_loader_name(args.loader)
# Load model-specific settings
p = Path(f'{args.model_dir}/config.yaml')
if p.exists():
model_config = yaml.safe_load(open(p, 'r').read())
else:
model_config = {}
del p
# Load custom model-specific settings
user_config = load_user_config()
model_config = OrderedDict(model_config)
user_config = OrderedDict(user_config)

View file

@ -17,9 +17,7 @@ from modules.utils import check_model_loaded
def generate_reply(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
from modules.models import load_model
shared.model, shared.tokenizer = load_model(shared.model_name)
models.load_model_if_idle_unloaded()
state = args[1] if len(args) > 1 else kwargs.get('state', {})
use_parallel = (
@ -31,10 +29,16 @@ def generate_reply(*args, **kwargs):
if not use_parallel:
shared.generation_lock.acquire()
with models._generation_count_lock:
models.active_generation_count += 1
try:
for result in _generate_reply(*args, **kwargs):
yield result
finally:
with models._generation_count_lock:
models.active_generation_count -= 1
models.last_generation_time = time.time()
if not use_parallel:
shared.generation_lock.release()
@ -126,7 +130,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
models.load_model_if_idle_unloaded()
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':
@ -176,7 +182,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
def decode(output_ids, skip_special_tokens=True):
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
models.load_model_if_idle_unloaded()
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens)

View file

@ -109,7 +109,6 @@ def load_model_HF(model_name):
params = {
'low_cpu_mem_usage': True,
'attn_implementation': shared.args.attn_implementation,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.original_args.trust_remote_code:
@ -120,6 +119,17 @@ def load_model_HF(model_name):
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code)
# Determine torch_dtype: respect --bf16 flag, otherwise autodetect
# from model config, but never allow float32.
if shared.args.bf16:
params['torch_dtype'] = torch.bfloat16
else:
dtype = getattr(config, 'torch_dtype', None) or getattr(getattr(config, 'text_config', None), 'torch_dtype', None)
if dtype in (torch.float16, torch.bfloat16):
params['torch_dtype'] = dtype
else:
params['torch_dtype'] = torch.float16
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:

View file

@ -82,7 +82,7 @@ def create_ui():
gr.HTML("<div class='sidebar-vertical-separator'></div>")
shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.')
gr.HTML("<div class='sidebar-vertical-separator'></div>")

View file

@ -51,6 +51,9 @@ def create_ui():
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
if not shared.args.portable:
shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.')
shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
bitsandbytes==0.49.*
datasets
@ -25,14 +25,14 @@ sentencepiece
tensorboard
torchao==0.15.*
trafilatura==2.0.0
transformers==5.3.*
transformers==5.5.*
triton-windows==3.5.1.post24; platform_system == "Windows"
tqdm
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -40,9 +40,11 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -37,5 +37,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -37,4 +37,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -37,4 +37,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -37,5 +37,7 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
pymupdf==1.27.*
python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
pymupdf==1.27.*
python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
sse-starlette==1.6.5
tiktoken
# ik_llama.cpp (CPU only)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
pymupdf==1.27.*
python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -18,7 +18,6 @@ import modules.extensions as extensions_module
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model_if_idle
from modules.models_settings import (
get_fallback_settings,
get_model_metadata,
update_model_parameters
)
@ -271,10 +270,6 @@ if __name__ == "__main__":
# Apply CLI overrides for image model settings (CLI flags take precedence over saved settings)
shared.apply_image_model_cli_overrides()
# Fallback settings for models
shared.model_config['.*'] = get_fallback_settings()
shared.model_config.move_to_end('.*', last=False) # Move to the beginning
# Activate the extensions listed on settings.yaml
extensions_module.available_extensions = utils.get_available_extensions()
for extension in shared.settings['default_extensions']:

View file

@ -1,203 +0,0 @@
# Default per-model settings, keyed by regular expression.
# Each top-level key is a regex pattern (note the .* wildcards and the
# (?!...) negative lookaheads); the nested mapping holds the settings
# applied to models whose name matches that pattern.
# NOTE(review): matching semantics (case sensitivity, precedence between
# overlapping patterns such as .*wizardcoder vs .*wizardcoder-15b) are
# defined by the consumer of this file — confirm there before reordering
# or merging entries.

# --- model_type hints --------------------------------------------------
# Architecture identifier for the loader backend.
.*(llama|alpac|vicuna|guanaco|koala|llava|wizardlm|metharme|pygmalion-7b|pygmalion-2|mythalion|wizard-mega|openbuddy|vigogne|h2ogpt-research|manticore):
  model_type: 'llama'
.*(opt-|opt_|opt1|opt3|optfor|galactica|galpaca|pygmalion-350m):
  model_type: 'opt'
.*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1):
  model_type: 'gptj'
.*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm):
  model_type: 'gptneox'
.*bloom:
  model_type: 'bloom'
.*gpt2:
  model_type: 'gpt2'
.*falcon:
  model_type: 'falcon'
.*mpt:
  model_type: 'mpt'
.*(starcoder|starchat):
  model_type: 'starcoder'
# NOTE(review): dolly-v2 also matches the gptneox group above — which
# setting wins depends on the consumer's precedence rules.
.*dolly-v2:
  model_type: 'dollyv2'
.*replit:
  model_type: 'replit'

# --- instruction templates and tokenizer flags -------------------------
# Chat instruction template per model family; skip_special_tokens: false
# keeps special tokens in the decoded output for models that need them.
.*(oasst|openassistant-|stablelm-7b-sft-v7-epoch-3):
  instruction_template: 'Open Assistant'
  skip_special_tokens: false
(?!.*galactica)(?!.*reward).*openassistant:
  instruction_template: 'Open Assistant'
  skip_special_tokens: false
.*galactica:
  skip_special_tokens: false
.*dolly-v[0-9]-[0-9]*b:
  instruction_template: 'Alpaca'
  skip_special_tokens: false
.*alpaca-native-4bit:
  instruction_template: 'Alpaca'
.*llava:
  instruction_template: 'LLaVA'
.*llava.*1.5:
  instruction_template: 'Vicuna-v1.1'
.*wizard.*mega:
  instruction_template: 'Wizard-Mega'
.*starchat-beta:
  instruction_template: 'Starchat-Beta'
# Vicuna variants: lookaheads route v0 vs v1.x vs stable vs chinese.
(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna:
  instruction_template: 'Vicuna-v0'
.*vicuna.*v0:
  instruction_template: 'Vicuna-v0'
.*vicuna.*(1.1|1_1|1.3|1_3):
  instruction_template: 'Vicuna-v1.1'
.*vicuna.*(1.5|1_5):
  instruction_template: 'Vicuna-v1.1'
.*stable.*vicuna:
  instruction_template: 'StableVicuna'
(?!.*chat).*chinese-vicuna:
  instruction_template: 'Alpaca'
.*chinese-vicuna.*chat:
  instruction_template: 'Chinese-Vicuna-Chat'
.*alpaca:
  instruction_template: 'Alpaca'
.*koala:
  instruction_template: 'Koala'
.*chatglm:
  instruction_template: 'ChatGLM'
.*(metharme|pygmalion|mythalion):
  instruction_template: 'Metharme'
.*raven:
  instruction_template: 'RWKV-Raven'
.*moss-moon.*sft:
  instruction_template: 'MOSS'
.*stablelm-tuned:
  instruction_template: 'StableLM'
.*galactica.*finetuned:
  instruction_template: 'Galactica Finetuned'
.*galactica.*-v2:
  instruction_template: 'Galactica v2'
(?!.*finetuned)(?!.*-v2).*galactica:
  instruction_template: 'Galactica'
.*guanaco:
  instruction_template: 'Guanaco non-chat'
.*baize:
  instruction_template: 'Baize'
.*mpt-.*instruct:
  instruction_template: 'Alpaca'
.*mpt-.*chat:
  instruction_template: 'ChatML'
(?!.*-flan-)(?!.*-t5-).*lamini-:
  instruction_template: 'Alpaca'
.*incite.*chat:
  instruction_template: 'INCITE-Chat'
.*incite.*instruct:
  instruction_template: 'INCITE-Instruct'
.*ziya-:
  instruction_template: 'Ziya'
.*koalpaca:
  instruction_template: 'KoAlpaca'
.*openbuddy:
  instruction_template: 'OpenBuddy'
(?!.*chat).*vigogne:
  instruction_template: 'Vigogne-Instruct'
.*vigogne.*chat:
  instruction_template: 'Vigogne-Chat'
.*(llama-deus|supercot|llama-natural-instructions|open-llama-0.3t-7b-instruct-dolly-hhrlhf|open-llama-0.3t-7b-open-instruct):
  instruction_template: 'Alpaca'
.*bactrian:
  instruction_template: 'Bactrian'
.*(h2ogpt-oig-|h2ogpt-oasst1-|h2ogpt-research-oasst1-):
  instruction_template: 'INCITE-Chat'
.*h2ogpt-gm-:
  instruction_template: 'H2O-prompt_answer'
.*manticore:
  instruction_template: 'Manticore Chat'
.*bluemoonrp-(30|13)b:
  instruction_template: 'Bluemoon'
.*Nous-Hermes-13b:
  instruction_template: 'Alpaca'
.*airoboros:
  instruction_template: 'Vicuna-v1.1'
.*airoboros.*1.2:
  instruction_template: 'Airoboros-v1.2'
.*alpa(cino|sta):
  instruction_template: 'Alpaca'
.*hippogriff:
  instruction_template: 'Hippogriff'
.*lazarus:
  instruction_template: 'Alpaca'
.*guanaco-.*(7|13|33|65)b:
  instruction_template: 'Vicuna-v0'
.*hypermantis:
  instruction_template: 'Alpaca'
.*open-llama-.*-open-instruct:
  instruction_template: 'Alpaca'
.*starcoder-gpteacher-code-instruct:
  instruction_template: 'Alpaca'
.*tulu:
  instruction_template: 'Tulu'
.*chronos:
  instruction_template: 'Alpaca'
.*samantha:
  instruction_template: 'Samantha'
.*wizardcoder:
  instruction_template: 'Alpaca'
.*minotaur:
  instruction_template: 'Manticore Chat'
.*orca_mini:
  instruction_template: 'Orca Mini'
.*(platypus|gplatty|superplatty):
  instruction_template: 'Alpaca'
.*(openorca-platypus2):
  instruction_template: 'OpenOrca-Platypus2'
.*longchat:
  instruction_template: 'Vicuna-v1.1'
.*vicuna-33b:
  instruction_template: 'Vicuna-v1.1'
.*redmond-hermes-coder:
  instruction_template: 'Alpaca'
.*wizardcoder-15b:
  instruction_template: 'Alpaca'
.*wizardlm:
  instruction_template: 'Vicuna-v1.1'
.*godzilla:
  instruction_template: 'Alpaca'
.*llama(-?)(2|v2).*chat:
  instruction_template: 'Llama-v2'
.*newhope:
  instruction_template: 'NewHope'
.*stablebeluga2:
  instruction_template: 'StableBeluga2'
.*openchat:
  instruction_template: 'OpenChat'
.*codellama.*instruct:
  instruction_template: 'Llama-v2'
.*(mistral|mixtral).*instruct:
  instruction_template: 'Mistral'
.*mistral.*openorca:
  instruction_template: 'ChatML'
.*(WizardCoder-Python-34B-V1.0|Phind-CodeLlama-34B-v2|CodeBooga-34B-v0.1):
  instruction_template: 'Alpaca'
.*orca-2-(13|7)b:
  instruction_template: 'ChatML'
.*openhermes.*mistral:
  instruction_template: 'ChatML'
.*Yi-34B-Chat:
  instruction_template: 'ChatML'
(dolphin).*:
  instruction_template: 'ChatML'
.*synthia:
  instruction_template: 'Synthia'
.*(hercules|hyperion):
  instruction_template: 'ChatML'
.*command-r:
  instruction_template: 'Command-R'
.*xwin-lm-70b-v0.1:
  instruction_template: 'Vicuna-v1.1'
.*platypus-yi-34b:
  instruction_template: 'Vicuna-v1.1'
.*CausalLM-RP-34B:
  instruction_template: 'ChatML'
# Exact-name entry (no leading .* — presumably still treated as a regex
# by the consumer; verify anchoring behavior there).
34b-beta:
  instruction_template: 'ChatML'
.*airoboros-3_1-yi-34b-200k:
  instruction_template: 'Llama-v2'
.*chatqa:
  instruction_template: 'NVIDIA-ChatQA'