Mirror of https://github.com/oobabooga/text-generation-webui.git
Synced 2026-01-13 12:10:03 +01:00
Commit d3a7710c62
@@ -1,4 +1,4 @@
# Text generation web UI
# Text Generation Web UI

A Gradio web UI for Large Language Models.

@@ -238,7 +238,7 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [-
[--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY]
[--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]

Text generation web UI
Text Generation Web UI

options:
-h, --help show this help message and exit
@@ -99,9 +99,11 @@
.message-body p em {
  color: rgb(110 110 110) !important;
}

.editing-textarea {
  width: max(30rem) !important;
}

.circle-you + .text .edit-control-button, .circle-you + .text .editing-textarea {
  color: #000 !important;
}

@@ -13,7 +13,9 @@
  line-height: 28px !important;
}

.dark .chat .message-body :is(p, li, q, em, h1, h2, h3, h4, h5, h6) {
.dark .chat .message-body :is(p,li,h1,h2,h3,h4,h5,h6),
.dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6,b,strong) em),
.dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6,b,strong) q) {
  color: #d1d5db !important;
}
css/main.css (26)

@@ -404,6 +404,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
  flex: 1;
  overflow: auto !important;
  border-radius: 0 !important;
  margin-bottom: 75px;
}

.chat-parent .prose {

@@ -428,10 +429,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
  margin-left: 5px;
}

.chat-parent.bigchat {
  flex: 1;
}

.chat > .messages {
  display: flex;
  flex-direction: column;

@@ -626,6 +623,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
  max-width: 54rem;
  left: 50%;
  transform: translateX(-50%);
  position: absolute;
  bottom: 0;
  background: var(--body-background-fill);
}

@media print {

@@ -828,10 +828,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
  padding: 1rem;
}

#chat-input-row.bigchat {
  padding-bottom: 1px !important;
}

#chat-col {
  height: 100dvh;
  display: flex;

@@ -847,10 +843,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
  }
}

#chat-col.bigchat {
  padding-bottom: 15px !important;
}

.message-body ol, .message-body ul {
  margin-top: 0 !important;
  margin-bottom: 1.25em !important;

@@ -1362,6 +1354,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
  cursor: pointer;
  user-select: none;
  font-size: 14px;
  line-height: var(--line-sm);
  color: rgb(0 0 0 / 70%);
  transition: background-color 0.2s;
}

@@ -1693,3 +1686,12 @@ button:focus {
#chat-input span {
  display: none;
}

.sidebar-vertical-separator {
  margin: 0;
  border-bottom: var(--input-border-width) solid var(--input-border-color);
}

.dark .sidebar-vertical-separator {
  border-bottom: 1px solid rgb(255 255 255 / 10%);
}
@@ -372,3 +372,18 @@ observer.observe(document.documentElement, {
  subtree: true,
  attributeFilter: ["style"]
});

//------------------------------------------------
// Suppress "Attempted to select a non-interactive or hidden tab" warning
//------------------------------------------------
(function() {
  const originalWarn = console.warn;

  console.warn = function(...args) {
    if (args[0] && typeof args[0] === "string" && args[0].includes("Attempted to select a non-interactive or hidden tab")) {
      return;
    }

    originalWarn.apply(console, args);
  };
})();
js/main.js (55)

@@ -206,7 +206,13 @@ const observer = new MutationObserver(function(mutations) {
    // Add padding to the messages container to create room for the last message.
    // The purpose of this is to avoid constant scrolling during streaming in
    // instruct mode.
    const bufferHeight = Math.max(0, Math.max(0.7 * window.innerHeight, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);
    let bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight);

    // Subtract header height when screen width is <= 924px
    if (window.innerWidth <= 924) {
      bufferHeight = Math.max(0, bufferHeight - 32);
    }

    messagesContainer.style.paddingBottom = `${bufferHeight}px`;
  }
}

@@ -260,13 +266,19 @@ function doSyntaxHighlighting() {
      codeBlock.classList.add("pretty_scrollbar");
    });

    renderMathInElement(messageBody, {
      delimiters: [
        { left: "$$", right: "$$", display: true },
        { left: "$", right: "$", display: false },
        { left: "\\(", right: "\\)", display: false },
        { left: "\\[", right: "\\]", display: true },
      ],
    // Only render math in visible elements
    const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
    mathContainers.forEach(container => {
      if (isElementVisibleOnScreen(container)) {
        renderMathInElement(container, {
          delimiters: [
            { left: "$$", right: "$$", display: true },
            { left: "$", right: "$", display: false },
            { left: "\\(", right: "\\)", display: false },
            { left: "\\[", right: "\\]", display: true },
          ],
        });
      }
    });
  } else if (hasSeenVisible) {
    // We've seen visible messages but this one is not visible

@@ -1065,3 +1077,30 @@ document.fonts.addEventListener("loadingdone", (event) => {
    }
  }, 50);
});

(function() {
  const chatParent = document.querySelector(".chat-parent");
  const chatInputRow = document.querySelector("#chat-input-row");
  const originalMarginBottom = 75;
  let originalHeight = chatInputRow.offsetHeight;

  function updateMargin() {
    const currentHeight = chatInputRow.offsetHeight;
    const heightDifference = currentHeight - originalHeight;
    chatParent.style.marginBottom = `${originalMarginBottom + heightDifference}px`;
  }

  // Watch for changes that might affect height
  const observer = new MutationObserver(updateMargin);
  observer.observe(chatInputRow, {
    childList: true,
    subtree: true,
    attributes: true
  });

  // Also listen for window resize
  window.addEventListener("resize", updateMargin);

  // Initial call to set the margin based on current state
  updateMargin();
})();
@@ -20,12 +20,6 @@ function toggle_controls(value) {
      extensions.style.display = "inherit";
    }

    // Remove bigchat classes
    chatParent.classList.remove("bigchat");
    document.getElementById("chat-input-row").classList.remove("bigchat");
    document.getElementById("chat-col").classList.remove("bigchat");
    document.getElementById("chat-tab").style.paddingBottom = "";

    let gallery_element = document.getElementById("gallery-extension");
    if (gallery_element) {
      gallery_element.style.display = "block";

@@ -47,11 +41,5 @@ function toggle_controls(value) {
    if (extensions) {
      extensions.style.display = "none";
    }

    // Add bigchat classes
    chatParent.classList.add("bigchat");
    document.getElementById("chat-input-row").classList.add("bigchat");
    document.getElementById("chat-col").classList.add("bigchat");
    document.getElementById("chat-tab").style.paddingBottom = "0px";
  }
}
@@ -38,7 +38,6 @@ def my_get(url, **kwargs):
    return requests.api.request('get', 'http://127.0.0.1/', **kwargs)


# Kindly provided by our friend WizardLM-30B
def my_open(*args, **kwargs):
    filename = str(args[0])
    if filename.endswith(('index.html', 'share.html')):

@@ -52,6 +51,11 @@ def my_open(*args, **kwargs):
        file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1')
        file_contents = file_contents.replace(
            '</head>',
            '\n <link rel="preload" href="file/css/Inter/Inter-VariableFont_opsz,wght.ttf" as="font" type="font/ttf" crossorigin>'
            '\n <link rel="preload" href="file/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf" as="font" type="font/ttf" crossorigin>'
            '\n <link rel="preload" href="file/css/NotoSans/NotoSans-Medium.woff2" as="font" type="font/woff2" crossorigin>'
            '\n <link rel="preload" href="file/css/NotoSans/NotoSans-MediumItalic.woff2" as="font" type="font/woff2" crossorigin>'
            '\n <link rel="preload" href="file/css/NotoSans/NotoSans-Bold.woff2" as="font" type="font/woff2" crossorigin>'
            '\n <script src="file/js/katex/katex.min.js"></script>'
            '\n <script src="file/js/katex/auto-render.min.js"></script>'
            '\n <script src="file/js/highlightjs/highlight.min.js"></script>'
modules/chat.py (322)

@@ -86,74 +86,6 @@ yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)


def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True):
    '''
    Given a Jinja template, reverse-engineers the prefix and the suffix for
    an assistant message (if impersonate=False) or an user message
    (if impersonate=True)
    '''

    if impersonate:
        messages = [
            {"role": "user", "content": "<<|user-message-1|>>"},
            {"role": "user", "content": "<<|user-message-2|>>"},
        ]
    else:
        messages = [
            {"role": "assistant", "content": "<<|user-message-1|>>"},
            {"role": "assistant", "content": "<<|user-message-2|>>"},
        ]

    prompt = renderer(messages=messages)

    suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0]
    suffix = prompt.split("<<|user-message-2|>>")[1]
    prefix = suffix_plus_prefix[len(suffix):]

    if strip_trailing_spaces:
        prefix = prefix.rstrip(' ')

    return prefix, suffix


def get_thinking_suppression_string(template):
    """
    Determines what string needs to be added to suppress thinking mode
    by comparing template renderings with thinking enabled vs disabled.
    """

    # Render with thinking enabled
    with_thinking = template.render(
        messages=[{'role': 'user', 'content': ''}],
        builtin_tools=None,
        tools=None,
        tools_in_user_message=False,
        add_generation_prompt=True,
        enable_thinking=True
    )

    # Render with thinking disabled
    without_thinking = template.render(
        messages=[{'role': 'user', 'content': ''}],
        builtin_tools=None,
        tools=None,
        tools_in_user_message=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Find the difference (what gets added to suppress thinking)
    i = 0
    while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]:
        i += 1

    j = 0
    while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]:
        j += 1

    return without_thinking[i:len(without_thinking) - j if j else None]


def generate_chat_prompt(user_input, state, **kwargs):
    impersonate = kwargs.get('impersonate', False)
    _continue = kwargs.get('_continue', False)
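The removed get_thinking_suppression_string worked by rendering the same template twice and keeping only the text that the thinking-disabled rendering adds. A minimal standalone sketch of that common-prefix/suffix comparison, with invented example strings not tied to any real template:

```python
def added_by_second(a: str, b: str) -> str:
    """Return the middle part of b that is not shared with a as a common
    prefix or suffix (i.e. what b adds relative to a)."""
    i = 0
    while i < min(len(a), len(b)) and a[i] == b[i]:
        i += 1

    j = 0
    while j < min(len(a), len(b)) - i and a[-1 - j] == b[-1 - j]:
        j += 1

    return b[i:len(b) - j if j else None]


# Hypothetical renderings: disabling thinking appends an empty think block,
# which is the suppression string that used to be appended to the prompt.
with_thinking = "<|user|>hi<|assistant|>"
without_thinking = "<|user|>hi<|assistant|><think>\n\n</think>\n\n"
print(repr(added_by_second(with_thinking, without_thinking)))
# '<think>\n\n</think>\n\n'
```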
@@ -176,7 +108,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
        tools=state['tools'] if 'tools' in state else None,
        tools_in_user_message=False,
        add_generation_prompt=False,
        reasoning_effort=state['reasoning_effort']
        enable_thinking=state['enable_thinking'],
        reasoning_effort=state['reasoning_effort'],
        thinking_budget=-1 if state.get('enable_thinking', True) else 0
    )

    chat_renderer = partial(
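These renderer keyword arguments only have an effect when the model's chat template actually reads them. A small sketch with a hypothetical Jinja template that branches on enable_thinking (jinja2 assumed available; the template text is invented for illustration):

```python
from jinja2 import Template

# Hypothetical template: appends an empty think block when thinking is disabled.
template = Template(
    "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}{% endfor %}"
    "<|assistant|>{% if not enable_thinking %}<think>\n\n</think>\n\n{% endif %}"
)

prompt = template.render(
    messages=[{"role": "user", "content": "hi"}],
    enable_thinking=False,
    reasoning_effort="medium",  # ignored unless the template references it
    thinking_budget=0,          # same: only meaningful to templates that use it
)
print(repr(prompt))
# '<|user|>hi<|assistant|><think>\n\n</think>\n\n'
```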
@@ -213,13 +147,11 @@ def generate_chat_prompt(user_input, state, **kwargs):
            if assistant_msg:
                # Handle GPT-OSS as a special case
                if '<|channel|>analysis<|message|>' in assistant_msg or '<|channel|>final<|message|>' in assistant_msg:

                    thinking_content = ""
                    final_content = ""

                    # Extract analysis content if present
                    if '<|channel|>analysis<|message|>' in assistant_msg:
                        # Split the message by the analysis tag to isolate the content that follows
                        parts = assistant_msg.split('<|channel|>analysis<|message|>', 1)
                        if len(parts) > 1:
                            # The content is everything after the tag

@@ -240,7 +172,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
                    # Extract final content if present
                    final_tag_to_find = '<|channel|>final<|message|>'
                    if final_tag_to_find in assistant_msg:
                        # Split the message by the final tag to isolate the content that follows
                        parts = assistant_msg.split(final_tag_to_find, 1)
                        if len(parts) > 1:
                            # The content is everything after the tag
@@ -260,7 +191,32 @@ def generate_chat_prompt(user_input, state, **kwargs):

                    messages.insert(insert_pos, msg_dict)

                # Handle Seed-OSS
                elif '<seed:think>' in assistant_msg:
                    thinking_content = ""
                    final_content = assistant_msg

                    # Extract thinking content if present
                    if '<seed:think>' in assistant_msg:
                        parts = assistant_msg.split('<seed:think>', 1)
                        if len(parts) > 1:
                            potential_content = parts[1]
                            if '</seed:think>' in potential_content:
                                thinking_content = potential_content.split('</seed:think>', 1)[0].strip()
                                final_content = parts[0] + potential_content.split('</seed:think>', 1)[1]
                            else:
                                thinking_content = potential_content.strip()
                                final_content = parts[0]

                    # Insert as structured message
                    msg_dict = {"role": "assistant", "content": final_content.strip()}
                    if thinking_content:
                        msg_dict["reasoning_content"] = thinking_content

                    messages.insert(insert_pos, msg_dict)

                else:
                    # Default case (used by all other models)
                    messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})

            if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
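The Seed-OSS branch above splits a stored assistant message on the <seed:think> markers and re-inserts it as a structured message with reasoning_content. A compact standalone sketch of that split (the helper name is illustrative, not part of the module):

```python
def split_seed_oss(assistant_msg: str) -> dict:
    """Turn '<seed:think>...</seed:think>answer' into a structured message."""
    thinking_content = ""
    final_content = assistant_msg

    if '<seed:think>' in assistant_msg:
        before, after = assistant_msg.split('<seed:think>', 1)
        if '</seed:think>' in after:
            thinking_content, rest = after.split('</seed:think>', 1)
            final_content = before + rest
        else:  # unterminated block: everything after the tag is thinking
            thinking_content, final_content = after, before

    msg = {"role": "assistant", "content": final_content.strip()}
    if thinking_content.strip():
        msg["reasoning_content"] = thinking_content.strip()

    return msg


print(split_seed_oss("<seed:think>check the units</seed:think>It is 42."))
# {'role': 'assistant', 'content': 'It is 42.', 'reasoning_content': 'check the units'}
```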
@@ -286,125 +242,120 @@ def generate_chat_prompt(user_input, state, **kwargs):
                        else:
                            attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"

                if image_refs or attachments_text:
                    enhanced_user_msg = user_msg
                    if image_refs:
                        enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}"
                    if attachments_text:
                        enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
                    if image_refs:
                        enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}"
                    if attachments_text:
                        enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"

                    messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})

    # Handle the current user input
    user_input = user_input.strip()

    # Check if we have attachments even with empty input
    has_attachments = False
    if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
        current_row_idx = len(history)
        user_key = f"user_{current_row_idx}"
        has_attachments = user_key in metadata and "attachments" in metadata[user_key]

    if (user_input or has_attachments) and not impersonate and not _continue:
        # For the current user input being processed, check if we need to add attachments
        if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
    # Check if we have attachments
    if not (impersonate or _continue):
        has_attachments = False
        if len(history_data.get('metadata', {})) > 0:
            current_row_idx = len(history)
            user_key = f"user_{current_row_idx}"
            has_attachments = user_key in metadata and "attachments" in metadata[user_key]

            if user_key in metadata and "attachments" in metadata[user_key]:
                attachments_text = ""
                image_refs = ""
        if user_input or has_attachments:
            # For the current user input being processed, check if we need to add attachments
            if len(history_data.get('metadata', {})) > 0:
                current_row_idx = len(history)
                user_key = f"user_{current_row_idx}"

                for attachment in metadata[user_key]["attachments"]:
                    if attachment.get("type") == "image":
                        image_refs += "<__media__>"
                    else:
                        filename = attachment.get("name", "file")
                        content = attachment.get("content", "")
                        if attachment.get("type") == "text/html" and attachment.get("url"):
                            attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
                if user_key in metadata and "attachments" in metadata[user_key]:
                    attachments_text = ""
                    image_refs = ""

                    for attachment in metadata[user_key]["attachments"]:
                        if attachment.get("type") == "image":
                            image_refs += "<__media__>"
                        else:
                            attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
                            filename = attachment.get("name", "file")
                            content = attachment.get("content", "")
                            if attachment.get("type") == "text/html" and attachment.get("url"):
                                attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n"
                            else:
                                attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"

                    if image_refs or attachments_text:
                        user_input = user_input
                        if image_refs:
                            user_input = f"{image_refs}\n\n{user_input}"
                        if attachments_text:
                            user_input += f"\n\nATTACHMENTS:\n{attachments_text}"

        messages.append({"role": "user", "content": user_input})
            messages.append({"role": "user", "content": user_input})

    if impersonate and state['mode'] != 'chat-instruct':
        messages.append({"role": "user", "content": "fake user message replace me"})

    def make_prompt(messages):
        if state['mode'] == 'chat-instruct' and _continue:
            prompt = renderer(messages=messages[:-1])
        last_message = messages[-1].copy()
        if _continue:
            if state['mode'] == 'chat-instruct':
                messages = messages[:-1]
            else:
                messages[-1]["content"] = "fake assistant message replace me"
                messages.append({"role": "assistant", "content": "this will get deleted"})

        if state['mode'] != 'chat-instruct':
            add_generation_prompt = (not _continue and not impersonate)
        else:
            prompt = renderer(messages=messages)
            add_generation_prompt = False

        prompt = renderer(
            messages=messages,
            add_generation_prompt=add_generation_prompt
        )

        if state['mode'] == 'chat-instruct':
            outer_messages = []
            if state['custom_system_message'].strip() != '':
                outer_messages.append({"role": "system", "content": state['custom_system_message']})

            command = state['chat-instruct_command']
            command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1'])
            command = command.replace('<|prompt|>', prompt)
            command = replace_character_names(command, state['name1'], state['name2'])

            if _continue:
                prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0]
                prefix += messages[-1]["content"]
            else:
                prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
                if not impersonate:
                    prefix = apply_extensions('bot_prefix', prefix, state)
            outer_messages = []
            if state['custom_system_message'].strip() != '':
                outer_messages.append({"role": "system", "content": state['custom_system_message']})

            outer_messages.append({"role": "user", "content": command})
            outer_messages.append({"role": "assistant", "content": prefix})
            if _continue:
                outer_messages.append(last_message.copy())
                outer_messages[-1]["content"] = "fake assistant message replace me"
                outer_messages.append({"role": "assistant", "content": "this will get deleted"})

            prompt = instruct_renderer(messages=outer_messages)
            suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
            if len(suffix) > 0:
                prompt = prompt[:-len(suffix)]
        else:
            # Handle GPT-OSS as a special case when continuing
            if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']:
                last_message_to_continue = messages[-1]
                prompt = renderer(messages=messages[:-1])
            prompt = instruct_renderer(
                messages=outer_messages,
                add_generation_prompt=not _continue
            )

                # Start the assistant turn wrapper
                assistant_reply_so_far = "<|start|>assistant"
            if _continue:
                prompt = prompt.split("fake assistant message replace me", 1)[0]

                if 'thinking' in last_message_to_continue:
                    assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>"
                content = last_message.get("content", "")
                partial_thought = last_message.get("thinking", "") or last_message.get("reasoning_content", "")

                assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}"

                prompt += assistant_reply_so_far

            else:
                prompt = renderer(messages=messages)
                if _continue:
                    suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
                    if len(suffix) > 0:
                        prompt = prompt[:-len(suffix)]
                # Handle partial thinking blocks (GPT-OSS and Seed-OSS)
                if not content and partial_thought and partial_thought.strip():
                    search_string = partial_thought.strip()
                    index = prompt.rfind(search_string)
                    if index != -1:
                        prompt = prompt[:index] + partial_thought
                else:
                    prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
                    else:
                        # Fallback if search fails: just append the thought
                        prompt += partial_thought
                else:
                    # All other cases
                    prompt += content

                    # Handle GPT-OSS as a special case when not continuing
                    if '<|channel|>final<|message|>' in state['instruction_template_str']:
                        if prefix.endswith("<|channel|>final<|message|>"):
                            prefix = prefix[:-len("<|channel|>final<|message|>")]
            if impersonate:
                prompt = prompt.split("fake user message replace me", 1)[0]
                prompt += user_input

                        if impersonate:
                            prefix += "<|message|>"

                if state['mode'] == 'chat' and not impersonate:
                    prefix = apply_extensions('bot_prefix', prefix, state)

                prompt += prefix

        if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])):
            prompt += get_thinking_suppression_string(instruction_template)
        if state['mode'] in ['chat', 'chat-instruct'] and not impersonate and not _continue:
            prompt += apply_extensions('bot_prefix', "", state)

        return prompt
@@ -525,29 +476,48 @@ def get_stopping_strings(state):
        renderer = partial(template.render, add_generation_prompt=False)
        renderers.append(renderer)

    if state['mode'] in ['chat', 'chat-instruct']:
    if state['mode'] in ['chat']:
        template = jinja_env.from_string(state['chat_template_str'])
        renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2'])
        renderers.append(renderer)

    for renderer in renderers:
        prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False)
        prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True)
    fake_messages = [
        {"role": "user", "content": "first user message"},
        {"role": "assistant", "content": "first assistant message"},
        {"role": "user", "content": "second user message"},
        {"role": "assistant", "content": "second assistant message"},
    ]

        stopping_strings += [
            suffix_user + prefix_bot,
            suffix_user + prefix_user,
            suffix_bot + prefix_bot,
            suffix_bot + prefix_user,
    stopping_strings = []
    for renderer in renderers:
        prompt = renderer(messages=fake_messages)

        # Find positions of each message content
        first_user_end = prompt.find("first user message") + len("first user message")
        first_assistant_start = prompt.find("first assistant message")
        first_assistant_end = prompt.find("first assistant message") + len("first assistant message")
        second_user_start = prompt.find("second user message")
        second_assistant_end = prompt.find("second assistant message") + len("second assistant message")

        # Extract pieces of text potentially containing unique stopping strings
        texts = [
            prompt[first_user_end:first_assistant_start],
            prompt[first_assistant_end:second_user_start],
            prompt[second_assistant_end:]
        ]

        # Try to find the EOT token
    for item in stopping_strings.copy():
        item = item.strip()
        if item.startswith("<") and ">" in item:
            stopping_strings.append(item.split(">")[0] + ">")
        elif item.startswith("[") and "]" in item:
            stopping_strings.append(item.split("]")[0] + "]")
        for text in texts:
            stripped_text = text.strip()
            if stripped_text.startswith("<") and ">" in stripped_text:
                stopping_strings.append(stripped_text.split(">")[0] + ">")
            elif stripped_text.startswith("[") and "]" in stripped_text:
                stopping_strings.append(stripped_text.split("]")[0] + "]")
            elif stripped_text.startswith("(") and ")" in stripped_text:
                stopping_strings.append(stripped_text.split(")")[0] + ")")
            elif stripped_text.startswith("{") and "}" in stripped_text:
                stopping_strings.append(stripped_text.split("}")[0] + "}")
            elif ":" in text:
                stopping_strings.append(text.split(":")[0] + ":")

    if 'stopping_strings' in state and isinstance(state['stopping_strings'], list):
        stopping_strings += state.pop('stopping_strings')
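The reworked get_stopping_strings no longer reverse-engineers turn prefixes and suffixes; it renders a fake four-message conversation and inspects the text between the known message bodies. A toy sketch of that idea, using a stand-in ChatML-like renderer rather than the webui's Jinja renderer:

```python
def toy_renderer(messages):
    # Stand-in for the Jinja template renderer: ChatML-like layout.
    return "".join(
        f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages
    )


fake_messages = [
    {"role": "user", "content": "first user message"},
    {"role": "assistant", "content": "first assistant message"},
    {"role": "user", "content": "second user message"},
    {"role": "assistant", "content": "second assistant message"},
]

prompt = toy_renderer(fake_messages)

# The text between the end of one message body and the start of the next holds
# the turn separators; an EOT-looking token at its start becomes a stop string.
sep = prompt[prompt.find("first user message") + len("first user message"):
             prompt.find("first assistant message")]
stripped = sep.strip()
if stripped.startswith("<") and ">" in stripped:
    print(stripped.split(">")[0] + ">")  # <|im_end|>
```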
@@ -765,6 +735,8 @@ def generate_search_query(user_message, state):
        query = query.rsplit("</think>", 1)[1]
    elif "<|start|>assistant<|channel|>final<|message|>" in query:
        query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
    elif "</seed:think>" in query:
        query = query.rsplit("</seed:think>", 1)[1]

    # Strip and remove surrounding quotes if present
    query = query.strip()

@@ -906,6 +878,12 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess

        # Extract the reply
        if state['mode'] in ['chat', 'chat-instruct']:
            reply = reply.lstrip()
            if reply.startswith(state['name2'] + ':'):
                reply = reply[len(state['name2'] + ':'):]
            elif reply.startswith(state['name1'] + ':'):
                reply = reply[len(state['name1'] + ':'):]

            visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
        else:
            visible_reply = reply
@@ -137,7 +137,7 @@ def extract_thinking_block(string):
            remaining_content = string[content_start:]
            return thinking_content, remaining_content

    # If think tags not found, try alternative format
    # If think tags not found, try GPT-OSS alternative format
    ALT_START = "<|channel|>analysis<|message|>"
    ALT_END = "<|end|>"
    ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>"

@@ -168,7 +168,31 @@ def extract_thinking_block(string):
            remaining_content = string[content_start:]
            return thinking_content, remaining_content

    # Return if neither format is found
    # Try seed:think format
    SEED_START = "<seed:think>"
    SEED_END = "</seed:think>"

    seed_start_pos = string.find(SEED_START)
    seed_end_pos = string.find(SEED_END)

    if seed_start_pos != -1 or seed_end_pos != -1:
        if seed_start_pos == -1:
            thought_start = 0
        else:
            thought_start = seed_start_pos + len(SEED_START)

        if seed_end_pos == -1:
            thought_end = len(string)
            content_start = len(string)
        else:
            thought_end = seed_end_pos
            content_start = seed_end_pos + len(SEED_END)

        thinking_content = string[thought_start:thought_end]
        remaining_content = string[content_start:]
        return thinking_content, remaining_content

    # Return if no format is found
    return None, string
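Because text arrives incrementally while streaming, the seed:think branch above tolerates a missing opening or closing tag. A quick sketch of how those position fallbacks behave (function name is illustrative):

```python
def split_partial_seed(string: str):
    """Mirror of the position fallbacks: missing start => thinking began before
    this chunk; missing end => everything so far is still thinking."""
    START, END = "<seed:think>", "</seed:think>"
    s, e = string.find(START), string.find(END)
    if s == -1 and e == -1:
        return None, string

    thought_start = 0 if s == -1 else s + len(START)
    thought_end = len(string) if e == -1 else e
    content_start = len(string) if e == -1 else e + len(END)
    return string[thought_start:thought_end], string[content_start:]


print(split_partial_seed("<seed:think>still thinki"))    # ('still thinki', '')
print(split_partial_seed("ng done</seed:think>Answer"))  # ('ng done', 'Answer')
```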
@@ -219,6 +243,27 @@ def process_markdown_content(string):
    if not string:
        return ""

    # Define a unique placeholder for LaTeX asterisks
    LATEX_ASTERISK_PLACEHOLDER = "LATEXASTERISKPLACEHOLDER"

    def protect_asterisks_in_latex(match):
        """A replacer function for re.sub to protect asterisks in multiple LaTeX formats."""
        # Check which delimiter group was captured
        if match.group(1) is not None:  # Content from $$...$$
            content = match.group(1)
            modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
            return f'$${modified_content}$$'
        elif match.group(2) is not None:  # Content from \[...\]
            content = match.group(2)
            modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
            return f'\\[{modified_content}\\]'
        elif match.group(3) is not None:  # Content from \(...\)
            content = match.group(3)
            modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER)
            return f'\\({modified_content}\\)'

        return match.group(0)  # Fallback

    # Make \[ \] LaTeX equations inline
    pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$'
    replacement = r'\\[ \1 \\]'

@@ -248,6 +293,10 @@ def process_markdown_content(string):
    string = string.replace('\\end{equation*}', '$$')
    string = re.sub(r"(.)```", r"\1\n```", string)

    # Protect asterisks within all LaTeX blocks before markdown conversion
    latex_pattern = re.compile(r'\$\$(.*?)\$\$|\\\[(.*?)\\\]|\\\((.*?)\\\)', re.DOTALL)
    string = latex_pattern.sub(protect_asterisks_in_latex, string)

    result = ''
    is_code = False
    is_latex = False

@@ -306,6 +355,9 @@ def process_markdown_content(string):
    # Convert to HTML using markdown
    html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()])

    # Restore the LaTeX asterisks after markdown conversion
    html_output = html_output.replace(LATEX_ASTERISK_PLACEHOLDER, '*')

    # Remove extra newlines before </code>
    html_output = re.sub(r'\s*</code>', '</code>', html_output)
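The placeholder indirection above exists because the markdown converter would otherwise treat asterisks inside LaTeX spans as emphasis markers. A minimal round-trip sketch of the same protect-convert-restore pattern, assuming the markdown package is installed:

```python
import re

import markdown

PLACEHOLDER = "LATEXASTERISKPLACEHOLDER"
latex_pattern = re.compile(r'\$\$(.*?)\$\$|\\\[(.*?)\\\]|\\\((.*?)\\\)', re.DOTALL)


def protect(match):
    # Whichever delimiter group matched, swap its asterisks for the placeholder.
    for group, left, right in ((1, '$$', '$$'), (2, r'\[', r'\]'), (3, r'\(', r'\)')):
        if match.group(group) is not None:
            return left + match.group(group).replace('*', PLACEHOLDER) + right
    return match.group(0)


text = "Compute $$a*b*c$$ now."
html = markdown.markdown(latex_pattern.sub(protect, text))
html = html.replace(PLACEHOLDER, '*')   # restore after conversion
print(html)  # <p>Compute $$a*b*c$$ now.</p>  (no <em> inserted inside the math)
```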
@@ -20,6 +20,7 @@ from modules.image_utils import (
    convert_pil_to_base64
)
from modules.logging_colors import logger
from modules.utils import resolve_model_path

llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"}

@@ -192,7 +193,7 @@ class LlamaServer:

        if shared.args.verbose:
            logger.info("GENERATE_PARAMS=")
            printable_payload = {k: (v if k != "prompt" else "[multimodal object]" if pil_images else v) for k, v in payload.items()}
            printable_payload = {k: v for k, v in payload.items() if k != "prompt"}
            pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
            print()

@@ -315,10 +316,9 @@ class LlamaServer:
            "--batch-size", str(shared.args.batch_size),
            "--port", str(self.port),
            "--no-webui",
            "--flash-attn", "on",
        ]

        if shared.args.flash_attn:
            cmd.append("--flash-attn")
        if shared.args.threads > 0:
            cmd += ["--threads", str(shared.args.threads)]
        if shared.args.threads_batch > 0:

@@ -351,14 +351,12 @@ class LlamaServer:
            if path.exists():
                cmd += ["--mmproj", str(path)]
        if shared.args.model_draft not in [None, 'None']:
            path = Path(shared.args.model_draft)
            if not path.exists():
                path = Path(f'{shared.args.model_dir}/{shared.args.model_draft}')
            path = resolve_model_path(shared.args.model_draft)

            if path.is_file():
                model_file = path
            else:
                model_file = sorted(Path(f'{shared.args.model_dir}/{shared.args.model_draft}').glob('*.gguf'))[0]
                model_file = sorted(path.glob('*.gguf'))[0]

            cmd += ["--model-draft", model_file]
            if shared.args.draft_max > 0:

@@ -411,8 +409,7 @@ class LlamaServer:
        self.process = subprocess.Popen(
            cmd,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
            bufsize=0,
            env=env
        )
@@ -474,34 +471,55 @@ def filter_stderr_with_progress(process_stderr):
    last_was_progress = False

    try:
        for raw in iter(process_stderr.readline, ''):
            line = raw.rstrip('\r\n')
            match = progress_re.search(line)
        # Read in binary mode and decode manually
        buffer = b""
        while True:
            # Read chunks aggressively to prevent buffer overflow
            chunk = process_stderr.read(4096)
            if not chunk:
                break

            if match:
                progress = float(match.group(1))
            buffer += chunk

                # Extract just the part from "prompt processing" onwards
                prompt_processing_idx = line.find('prompt processing')
                if prompt_processing_idx != -1:
                    display_line = line[prompt_processing_idx:]
                else:
                    display_line = line  # fallback to full line
            # Process complete lines
            while b'\n' in buffer:
                line_bytes, buffer = buffer.split(b'\n', 1)
                try:
                    line = line_bytes.decode('utf-8', errors='replace').strip('\r\n')
                    if line:  # Process non-empty lines
                        match = progress_re.search(line)

                # choose carriage return for in-progress or newline at completion
                end_char = '\r' if progress < 1.0 else '\n'
                print(display_line, end=end_char, file=sys.stderr, flush=True)
                last_was_progress = (progress < 1.0)
                        if match:
                            progress = float(match.group(1))

            # skip noise lines
            elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line):
                # if we were in progress, finish that line first
                if last_was_progress:
                    print(file=sys.stderr)
                            # Extract just the part from "prompt processing" onwards
                            prompt_processing_idx = line.find('prompt processing')
                            if prompt_processing_idx != -1:
                                display_line = line[prompt_processing_idx:]
                            else:
                                display_line = line  # fallback to full line

                print(line, file=sys.stderr, flush=True)
                last_was_progress = False
                            # choose carriage return for in-progress or newline at completion
                            end_char = '\r' if progress < 1.0 else '\n'
                            print(display_line, end=end_char, file=sys.stderr, flush=True)
                            last_was_progress = (progress < 1.0)

                        # skip noise lines
                        elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line):
                            # if we were in progress, finish that line first
                            if last_was_progress:
                                print(file=sys.stderr)

                            print(line, file=sys.stderr, flush=True)
                            last_was_progress = False

                except Exception:
                    continue

    except (ValueError, IOError):
        # silently ignore broken output or IO errors
        pass
    finally:
        try:
            process_stderr.close()
        except:
            pass
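The rewritten reader drops line-buffered text mode and instead drains stderr in binary chunks, splitting lines itself, so long carriage-return progress output cannot stall the pipe. A reduced sketch of that read loop against an arbitrary child process (the spawned command is illustrative):

```python
import subprocess
import sys

# Illustrative command; any process that writes to stderr works.
proc = subprocess.Popen(
    [sys.executable, "-c", "import sys; [print(i, file=sys.stderr) for i in range(5)]"],
    stderr=subprocess.PIPE,
    bufsize=0,  # unbuffered on our side: read() returns as soon as bytes arrive
)

buffer = b""
while True:
    chunk = proc.stderr.read(4096)   # drain aggressively in binary mode
    if not chunk:
        break
    buffer += chunk
    while b"\n" in buffer:           # hand off complete lines only
        line_bytes, buffer = buffer.split(b"\n", 1)
        line = line_bytes.decode("utf-8", errors="replace").rstrip("\r")
        print(f"stderr line: {line}")

proc.wait()
```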
@@ -16,7 +16,6 @@ loaders_and_params = OrderedDict({
        'streaming_llm',
        'rope_freq_base',
        'compress_pos_emb',
        'flash_attn',
        'row_split',
        'no_kv_offload',
        'no_mmap',
@@ -1,10 +1,10 @@
import sys
import time
from pathlib import Path

import modules.shared as shared
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
from modules.utils import resolve_model_path

last_generation_time = time.time()

@@ -45,18 +45,19 @@ def load_model(model_name, loader=None):
            model, tokenizer = output
        else:
            model = output
            if model is None:
                return None, None
            else:
            if model is not None:
                from modules.transformers_loader import load_tokenizer
                tokenizer = load_tokenizer(model_name)

    if model is None:
        return None, None

    shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
    if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
        shared.settings['truncation_length'] = shared.args.ctx_size

    shared.is_multimodal = False
    if loader.lower() in ('exllamav3', 'llama.cpp'):
    if loader.lower() in ('exllamav3', 'llama.cpp') and hasattr(model, 'is_multimodal'):
        shared.is_multimodal = model.is_multimodal()

    logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")

@@ -69,17 +70,24 @@ def load_model(model_name, loader=None):
def llama_cpp_server_loader(model_name):
    from modules.llama_cpp_server import LlamaServer

    path = Path(f'{shared.args.model_dir}/{model_name}')
    path = resolve_model_path(model_name)

    if path.is_file():
        model_file = path
    else:
        model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
        gguf_files = sorted(path.glob('*.gguf'))
        if not gguf_files:
            logger.error(f"No .gguf models found in the directory: {path}")
            return None, None

        model_file = gguf_files[0]

    try:
        model = LlamaServer(model_file)
        return model, model
    except Exception as e:
        logger.error(f"Error loading the model with llama.cpp: {str(e)}")
        return None, None


def transformers_loader(model_name):
@@ -10,6 +10,7 @@ import yaml

from modules import chat, loaders, metadata_gguf, shared, ui
from modules.logging_colors import logger
from modules.utils import resolve_model_path


def get_fallback_settings():

@@ -26,6 +27,7 @@ def get_fallback_settings():


def get_model_metadata(model):
    model_path = resolve_model_path(model)
    model_settings = {}

    # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml

@@ -35,7 +37,7 @@ def get_model_metadata(model):
            for k in settings[pat]:
                model_settings[k] = settings[pat][k]

    path = Path(f'{shared.args.model_dir}/{model}/config.json')
    path = model_path / 'config.json'
    if path.exists():
        hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read())
    else:

@@ -51,7 +53,7 @@ def get_model_metadata(model):

    # GGUF metadata
    if model_settings['loader'] == 'llama.cpp':
        path = Path(f'{shared.args.model_dir}/{model}')
        path = model_path
        if path.is_file():
            model_file = path
        else:

@@ -66,7 +68,7 @@ def get_model_metadata(model):
        metadata = load_gguf_metadata_with_cache(model_file)

        for k in metadata:
            if k.endswith('context_length'):
            if k.endswith('.context_length'):
                model_settings['ctx_size'] = min(metadata[k], 8192)
                model_settings['truncation_length_info'] = metadata[k]
            elif k.endswith('rope.freq_base'):

@@ -92,8 +94,6 @@ def get_model_metadata(model):

            template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
            template = re.sub(r'raise_exception\([^)]*\)', "''", template)
            template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
            template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template)  # Handle GPT-OSS
            model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
            model_settings['instruction_template_str'] = template

@@ -130,18 +130,18 @@ def get_model_metadata(model):
            model_settings['bf16'] = True

    # Try to find the Jinja instruct template
    path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
    path = model_path / 'tokenizer_config.json'
    template = None

    # 1. Prioritize reading from chat_template.jinja if it exists
    jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja'
    jinja_path = model_path / 'chat_template.jinja'
    if jinja_path.exists():
        with open(jinja_path, 'r', encoding='utf-8') as f:
            template = f.read()

    # 2. If no .jinja file, try chat_template.json
    if template is None:
        json_template_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.json'
        json_template_path = model_path / 'chat_template.json'
        if json_template_path.exists():
            with open(json_template_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

@@ -170,8 +170,6 @@ def get_model_metadata(model):

        template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
        template = re.sub(r'raise_exception\([^)]*\)', "''", template)
        template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
        template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template)  # Handle GPT-OSS
        model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
        model_settings['instruction_template_str'] = template

@@ -201,7 +199,7 @@ def get_model_metadata(model):


def infer_loader(model_name, model_settings, hf_quant_method=None):
    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
    path_to_model = resolve_model_path(model_name)
    if not path_to_model.exists():
        loader = None
    elif shared.args.portable:

@@ -357,7 +355,7 @@ def get_model_size_mb(model_file: Path) -> float:


def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
    model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
    model_file = resolve_model_path(gguf_file)
    metadata = load_gguf_metadata_with_cache(model_file)
    size_in_mb = get_model_size_mb(model_file)
@@ -22,8 +22,7 @@ def load_prompt(fname):
        if file_path.exists():
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
                if len(text) > 0 and text[-1] == '\n':
                    text = text[:-1]
                text = text.rstrip()

                return text
        else:
@@ -31,7 +31,7 @@ persistent_interface_state = {}
need_restart = False

# Parser copied from https://github.com/vladmandic/automatic
parser = argparse.ArgumentParser(description="Text generation web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200))
parser = argparse.ArgumentParser(description="Text Generation Web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200))

# Basic settings
group = parser.add_argument_group('Basic settings')

@@ -73,7 +73,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for

# llama.cpp
group = parser.add_argument_group('llama.cpp')
group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')

@@ -159,9 +158,6 @@ group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 f
group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API')
group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')

# Deprecated parameters
group = parser.add_argument_group('Deprecated')

# Handle CMD_FLAGS.txt
cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt"
if cmd_flags_path.exists():

@@ -203,7 +199,7 @@ settings = {
    'start_with': '',
    'mode': 'instruct',
    'chat_style': 'cai-chat',
    'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
    'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>". Reply directly, without starting the reply with the character name.\n\n<|prompt|>',
    'enable_web_search': False,
    'web_search_pages': 3,
    'prompt-notebook': '',

@@ -287,7 +283,7 @@ settings = {
    'greeting': 'How can I help you today?',
    'custom_system_message': '',
    'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}",
    'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}",
    'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- name2 + ':' -}}\n{%- endif %}",

    # Extensions
    'default_extensions': [],
@@ -142,7 +142,6 @@ def list_model_elements():
        'num_experts_per_token',
        'load_in_8bit',
        'load_in_4bit',
        'flash_attn',
        'attn_implementation',
        'cpu',
        'disk',
@@ -78,21 +78,21 @@ def create_ui():
            with gr.Row():
                shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])

            gr.HTML("<div style='margin: 0; border-bottom: 1px solid rgba(255,255,255,0.1);'></div>")
            gr.HTML("<div class='sidebar-vertical-separator'></div>")

            shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
            shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.')
            shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')

            gr.HTML("<div style='margin: 0; border-bottom: 1px solid rgba(255,255,255,0.1);'></div>")
            gr.HTML("<div class='sidebar-vertical-separator'></div>")

            shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search')
            with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
                shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)

            gr.HTML("<div style='margin: 0; border-bottom: 1px solid rgba(255,255,255,0.1);'></div>")
            gr.HTML("<div class='sidebar-vertical-separator'></div>")

            with gr.Row():
                shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
                shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='In instruct and chat-instruct modes, the template under Parameters > Instruction template is used.', elem_id='chat-mode')

            with gr.Row():
                shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')

@@ -100,7 +100,7 @@ def create_ui():
            with gr.Row():
                shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])

            gr.HTML("<div style='margin: 0; border-bottom: 1px solid rgba(255,255,255,0.1);'></div>")
            gr.HTML("<div class='sidebar-vertical-separator'></div>")

            with gr.Row():
                shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm')
@@ -22,8 +22,7 @@ def create_ui():
    with gr.Row():
        with gr.Column():
            with gr.Row():
                initial_text = load_prompt(shared.settings['prompt-notebook'])
                shared.gradio['textbox-default'] = gr.Textbox(value=initial_text, lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar'])
                shared.gradio['textbox-default'] = gr.Textbox(value="", lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar'])
                shared.gradio['token-counter-default'] = gr.HTML(value="<span>0</span>", elem_id="default-token-counter")

            with gr.Row():
@@ -50,7 +50,6 @@ def create_ui():

                        with gr.Column():
                            shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
                            shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
                            shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                            shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
@@ -30,8 +30,7 @@ def create_ui():
            with gr.Column(scale=4):
                with gr.Tab('Raw'):
                    with gr.Row():
                        initial_text = load_prompt(shared.settings['prompt-notebook'])
                        shared.gradio['textbox-notebook'] = gr.Textbox(label="", value=initial_text, lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar'])
                        shared.gradio['textbox-notebook'] = gr.Textbox(label="", value="", lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar'])
                        shared.gradio['token-counter-notebook'] = gr.HTML(value="<span>0</span>", elem_id="notebook-token-counter")

                with gr.Tab('Markdown'):
@@ -86,6 +86,19 @@ def check_model_loaded():
    return True, None


def resolve_model_path(model_name_or_path):
    """
    Resolves a model path, checking for a direct path
    before the default models directory.
    """

    path_candidate = Path(model_name_or_path)
    if path_candidate.exists():
        return path_candidate
    else:
        return Path(f'{shared.args.model_dir}/{model_name_or_path}')


def get_available_models():
    # Get all GGUF files
    gguf_files = get_available_ggufs()
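resolve_model_path lets callers pass either an existing path or a bare model name. A self-contained sketch of the same fallback rule (the directory constant stands in for shared.args.model_dir):

```python
from pathlib import Path

MODEL_DIR = Path("user_data/models")  # stand-in for shared.args.model_dir


def resolve(model_name_or_path: str) -> Path:
    # Mirrors resolve_model_path: prefer an existing direct path, otherwise
    # fall back to a name inside the models directory.
    candidate = Path(model_name_or_path)
    return candidate if candidate.exists() else MODEL_DIR / model_name_or_path


print(resolve("MyModel-7B"))  # user_data/models/MyModel-7B (name fallback)
print(resolve("/tmp"))        # /tmp (existing path used as-is)
```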
@ -1,4 +1,5 @@
|
|||
accelerate==1.8.*
|
||||
audioop-lts<1.0; python_version >= "3.13"
|
||||
bitsandbytes==0.46.*
|
||||
colorama
|
||||
datasets
|
||||
|
|
@ -34,8 +35,8 @@ sse-starlette==1.6.5
|
|||
tiktoken
|
||||
|
||||
# CUDA wheels
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
|
||||
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
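The `; platform_system == ...` suffixes on the wheel URLs above are standard PEP 508 environment markers: pip installs a given line only when its marker evaluates to true for the current OS, architecture, and interpreter. A minimal sketch of checking such a marker with the packaging library (illustrative only, not part of the commit):

from packaging.markers import Marker

# Marker string copied from one of the Linux CUDA wheel lines above.
marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
print(marker.evaluate())  # True only on 64-bit Linux with Python 3.11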
@ -1,4 +1,5 @@
accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops

@ -33,7 +34,7 @@ sse-starlette==1.6.5
tiktoken

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
@ -1,4 +1,5 @@
accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops

@ -33,7 +34,7 @@ sse-starlette==1.6.5
tiktoken

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
@ -1,4 +1,5 @@
accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops

@ -33,7 +34,7 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
@ -1,4 +1,5 @@
accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops

@ -33,8 +34,8 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
@ -1,4 +1,5 @@
accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops

@ -33,5 +34,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
@ -1,4 +1,5 @@
accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops

@ -33,5 +34,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
@ -1,4 +1,5 @@
accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
bitsandbytes==0.46.*
colorama
datasets

@ -34,8 +35,8 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
@ -1,4 +1,5 @@
accelerate==1.8.*
audioop-lts<1.0; python_version >= "3.13"
colorama
datasets
einops
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15

@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15

@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15

@ -18,6 +19,6 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15

@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15

@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15

@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15

@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@ -1,3 +1,4 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
gradio==4.37.*
html2text==2025.4.15

@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
27 server.py
@ -6,6 +6,7 @@ from pathlib import Path
from modules import shared
from modules.block_requests import OpenMonkeyPatch, RequestBlocker
from modules.logging_colors import logger
from modules.prompts import load_prompt

# Set up Gradio temp directory path
gradio_temp_path = Path('user_data') / 'cache' / 'gradio'
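The hunk cuts off before showing how gradio_temp_path is used. A common way to make Gradio write its temporary files to such a directory (an assumption for illustration; this diff does not show the project's actual mechanism) is the GRADIO_TEMP_DIR environment variable, set before Gradio creates any files:

import os
from pathlib import Path

gradio_temp_path = Path('user_data') / 'cache' / 'gradio'
gradio_temp_path.mkdir(parents=True, exist_ok=True)
os.environ['GRADIO_TEMP_DIR'] = str(gradio_temp_path)  # Gradio reads this variable for its temp/cache location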
@ -70,7 +71,7 @@ from modules.utils import gradio
def signal_handler(sig, frame):
logger.info("Received Ctrl+C. Shutting down Text generation web UI gracefully.")
logger.info("Received Ctrl+C. Shutting down Text Generation Web UI gracefully.")

# Explicitly stop LlamaServer to avoid __del__ cleanup issues during shutdown
if shared.model and shared.model.__class__.__name__ == 'LlamaServer':
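For reference, the Ctrl+C handling above follows the usual signal-module pattern; a minimal standalone sketch of the same idea (names and messages here are illustrative, not the project's):

import signal
import sys

def handle_sigint(sig, frame):
    # Release resources (stop model servers, flush state), then exit cleanly.
    print("Received Ctrl+C. Shutting down gracefully.")
    sys.exit(0)

signal.signal(signal.SIGINT, handle_sigint)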
@ -87,7 +88,7 @@ signal.signal(signal.SIGINT, signal_handler)
def create_interface():

title = 'Text generation web UI'
title = 'Text Generation Web UI'

# Password authentication
auth = []
@ -109,6 +110,13 @@ def create_interface():
'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp'
})

if shared.settings['prompt-notebook']:
prompt = load_prompt(shared.settings['prompt-notebook'])
shared.persistent_interface_state.update({
'textbox-default': prompt,
'textbox-notebook': prompt
})

# Clear existing cache files
for cache_file in ['pfp_character.png', 'pfp_character_thumb.png']:
cache_path = Path(f"user_data/cache/{cache_file}")
@ -230,7 +238,7 @@ def create_interface():
if __name__ == "__main__":

logger.info("Starting Text generation web UI")
logger.info("Starting Text Generation Web UI")
do_cmd_flags_warnings()

# Load custom settings
@ -283,21 +291,14 @@ if __name__ == "__main__":
# If any model has been selected, load it
if shared.model_name != 'None':
p = Path(shared.model_name)
if p.exists():
model_name = p.parts[-1]
shared.model_name = model_name
else:
model_name = shared.model_name

model_settings = get_model_metadata(model_name)
model_settings = get_model_metadata(shared.model_name)
update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments

# Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
vram_usage, adjusted_layers = update_gpu_layers_and_vram(
shared.args.loader,
model_name,
shared.model_name,
model_settings['gpu_layers'],
shared.args.ctx_size,
shared.args.cache_type,
@ -308,7 +309,7 @@ if __name__ == "__main__":
shared.args.gpu_layers = adjusted_layers

# Load the model
shared.model, shared.tokenizer = load_model(model_name)
shared.model, shared.tokenizer = load_model(shared.model_name)
if shared.args.lora:
add_lora_to_model(shared.args.lora)