diff --git a/README.md b/README.md index 6b49cee0..d42697dd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Text generation web UI +# Text Generation Web UI A Gradio web UI for Large Language Models. @@ -238,7 +238,7 @@ usage: server.py [-h] [--multi-user] [--model MODEL] [--lora LORA [LORA ...]] [- [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] -Text generation web UI +Text Generation Web UI options: -h, --help show this help message and exit diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index 583703c0..70fd6d4a 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -99,9 +99,11 @@ .message-body p em { color: rgb(110 110 110) !important; } + .editing-textarea { width: max(30rem) !important; } + .circle-you + .text .edit-control-button, .circle-you + .text .editing-textarea { color: #000 !important; } diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 3e5ebe67..6dee0a89 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -13,7 +13,9 @@ line-height: 28px !important; } -.dark .chat .message-body :is(p, li, q, em, h1, h2, h3, h4, h5, h6) { +.dark .chat .message-body :is(p,li,h1,h2,h3,h4,h5,h6), +.dark .chat .message-body em:not(:is(h1,h2,h3,h4,h5,h6,b,strong) em), +.dark .chat .message-body q:not(:is(h1,h2,h3,h4,h5,h6,b,strong) q) { color: #d1d5db !important; } diff --git a/css/main.css b/css/main.css index 062d3eb2..c7ee57da 100644 --- a/css/main.css +++ b/css/main.css @@ -404,6 +404,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { flex: 1; overflow: auto !important; border-radius: 0 !important; + margin-bottom: 75px; } .chat-parent .prose { @@ -428,10 +429,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { margin-left: 5px; } -.chat-parent.bigchat { - flex: 1; -} - .chat > .messages { display: flex; flex-direction: column; @@ -626,6 +623,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { max-width: 54rem; left: 50%; transform: translateX(-50%); + position: absolute; + bottom: 0; + background: var(--body-background-fill); } @media print { @@ -828,10 +828,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 1rem; } -#chat-input-row.bigchat { - padding-bottom: 1px !important; -} - #chat-col { height: 100dvh; display: flex; @@ -847,10 +843,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } } -#chat-col.bigchat { - padding-bottom: 15px !important; -} - .message-body ol, .message-body ul { margin-top: 0 !important; margin-bottom: 1.25em !important; @@ -1362,6 +1354,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { cursor: pointer; user-select: none; font-size: 14px; + line-height: var(--line-sm); color: rgb(0 0 0 / 70%); transition: background-color 0.2s; } @@ -1693,3 +1686,12 @@ button:focus { #chat-input span { display: none; } + +.sidebar-vertical-separator { + margin: 0; + border-bottom: var(--input-border-width) solid var(--input-border-color); +} + +.dark .sidebar-vertical-separator { + border-bottom: 1px solid rgb(255 255 255 / 10%); +} diff --git a/js/global_scope_js.js b/js/global_scope_js.js index ebed1f3d..4d8c1121 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -372,3 +372,18 @@ observer.observe(document.documentElement, { subtree: true, attributeFilter: ["style"] }); + +//------------------------------------------------ +// 
Suppress "Attempted to select a non-interactive or hidden tab" warning +//------------------------------------------------ +(function() { + const originalWarn = console.warn; + + console.warn = function(...args) { + if (args[0] && typeof args[0] === "string" && args[0].includes("Attempted to select a non-interactive or hidden tab")) { + return; + } + + originalWarn.apply(console, args); + }; +})(); diff --git a/js/main.js b/js/main.js index 4b4b14c2..c31621f6 100644 --- a/js/main.js +++ b/js/main.js @@ -206,7 +206,13 @@ const observer = new MutationObserver(function(mutations) { // Add padding to the messages container to create room for the last message. // The purpose of this is to avoid constant scrolling during streaming in // instruct mode. - const bufferHeight = Math.max(0, Math.max(0.7 * window.innerHeight, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight); + let bufferHeight = Math.max(0, Math.max(window.innerHeight - 128 - 84, window.innerHeight - prevSibling.offsetHeight - 84) - lastChild.offsetHeight); + + // Subtract header height when screen width is <= 924px + if (window.innerWidth <= 924) { + bufferHeight = Math.max(0, bufferHeight - 32); + } + messagesContainer.style.paddingBottom = `${bufferHeight}px`; } } @@ -260,13 +266,19 @@ function doSyntaxHighlighting() { codeBlock.classList.add("pretty_scrollbar"); }); - renderMathInElement(messageBody, { - delimiters: [ - { left: "$$", right: "$$", display: true }, - { left: "$", right: "$", display: false }, - { left: "\\(", right: "\\)", display: false }, - { left: "\\[", right: "\\]", display: true }, - ], + // Only render math in visible elements + const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt"); + mathContainers.forEach(container => { + if (isElementVisibleOnScreen(container)) { + renderMathInElement(container, { + delimiters: [ + { left: "$$", right: "$$", display: true }, + { left: "$", right: "$", display: false }, + { left: "\\(", right: "\\)", display: false }, + { left: "\\[", right: "\\]", display: true }, + ], + }); + } }); } else if (hasSeenVisible) { // We've seen visible messages but this one is not visible @@ -1065,3 +1077,30 @@ document.fonts.addEventListener("loadingdone", (event) => { } }, 50); }); + +(function() { + const chatParent = document.querySelector(".chat-parent"); + const chatInputRow = document.querySelector("#chat-input-row"); + const originalMarginBottom = 75; + let originalHeight = chatInputRow.offsetHeight; + + function updateMargin() { + const currentHeight = chatInputRow.offsetHeight; + const heightDifference = currentHeight - originalHeight; + chatParent.style.marginBottom = `${originalMarginBottom + heightDifference}px`; + } + + // Watch for changes that might affect height + const observer = new MutationObserver(updateMargin); + observer.observe(chatInputRow, { + childList: true, + subtree: true, + attributes: true + }); + + // Also listen for window resize + window.addEventListener("resize", updateMargin); + + // Initial call to set the margin based on current state + updateMargin(); +})(); diff --git a/js/show_controls.js b/js/show_controls.js index f974d412..ff513395 100644 --- a/js/show_controls.js +++ b/js/show_controls.js @@ -20,12 +20,6 @@ function toggle_controls(value) { extensions.style.display = "inherit"; } - // Remove bigchat classes - chatParent.classList.remove("bigchat"); - document.getElementById("chat-input-row").classList.remove("bigchat"); - 
document.getElementById("chat-col").classList.remove("bigchat"); - document.getElementById("chat-tab").style.paddingBottom = ""; - let gallery_element = document.getElementById("gallery-extension"); if (gallery_element) { gallery_element.style.display = "block"; @@ -47,11 +41,5 @@ function toggle_controls(value) { if (extensions) { extensions.style.display = "none"; } - - // Add bigchat classes - chatParent.classList.add("bigchat"); - document.getElementById("chat-input-row").classList.add("bigchat"); - document.getElementById("chat-col").classList.add("bigchat"); - document.getElementById("chat-tab").style.paddingBottom = "0px"; } } diff --git a/modules/block_requests.py b/modules/block_requests.py index dc1ee467..911e41d9 100644 --- a/modules/block_requests.py +++ b/modules/block_requests.py @@ -38,7 +38,6 @@ def my_get(url, **kwargs): return requests.api.request('get', 'http://127.0.0.1/', **kwargs) -# Kindly provided by our friend WizardLM-30B def my_open(*args, **kwargs): filename = str(args[0]) if filename.endswith(('index.html', 'share.html')): @@ -52,6 +51,11 @@ def my_open(*args, **kwargs): file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1') file_contents = file_contents.replace( '', + '\n ' + '\n ' + '\n ' + '\n ' + '\n ' '\n ' '\n ' '\n ' diff --git a/modules/chat.py b/modules/chat.py index ab6b43c0..ad2f4001 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -86,74 +86,6 @@ yaml.add_representer(str, str_presenter) yaml.representer.SafeRepresenter.add_representer(str, str_presenter) -def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True): - ''' - Given a Jinja template, reverse-engineers the prefix and the suffix for - an assistant message (if impersonate=False) or an user message - (if impersonate=True) - ''' - - if impersonate: - messages = [ - {"role": "user", "content": "<<|user-message-1|>>"}, - {"role": "user", "content": "<<|user-message-2|>>"}, - ] - else: - messages = [ - {"role": "assistant", "content": "<<|user-message-1|>>"}, - {"role": "assistant", "content": "<<|user-message-2|>>"}, - ] - - prompt = renderer(messages=messages) - - suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0] - suffix = prompt.split("<<|user-message-2|>>")[1] - prefix = suffix_plus_prefix[len(suffix):] - - if strip_trailing_spaces: - prefix = prefix.rstrip(' ') - - return prefix, suffix - - -def get_thinking_suppression_string(template): - """ - Determines what string needs to be added to suppress thinking mode - by comparing template renderings with thinking enabled vs disabled. 
- """ - - # Render with thinking enabled - with_thinking = template.render( - messages=[{'role': 'user', 'content': ''}], - builtin_tools=None, - tools=None, - tools_in_user_message=False, - add_generation_prompt=True, - enable_thinking=True - ) - - # Render with thinking disabled - without_thinking = template.render( - messages=[{'role': 'user', 'content': ''}], - builtin_tools=None, - tools=None, - tools_in_user_message=False, - add_generation_prompt=True, - enable_thinking=False - ) - - # Find the difference (what gets added to suppress thinking) - i = 0 - while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]: - i += 1 - - j = 0 - while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]: - j += 1 - - return without_thinking[i:len(without_thinking) - j if j else None] - - def generate_chat_prompt(user_input, state, **kwargs): impersonate = kwargs.get('impersonate', False) _continue = kwargs.get('_continue', False) @@ -176,7 +108,9 @@ def generate_chat_prompt(user_input, state, **kwargs): tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, add_generation_prompt=False, - reasoning_effort=state['reasoning_effort'] + enable_thinking=state['enable_thinking'], + reasoning_effort=state['reasoning_effort'], + thinking_budget=-1 if state.get('enable_thinking', True) else 0 ) chat_renderer = partial( @@ -213,13 +147,11 @@ def generate_chat_prompt(user_input, state, **kwargs): if assistant_msg: # Handle GPT-OSS as a special case if '<|channel|>analysis<|message|>' in assistant_msg or '<|channel|>final<|message|>' in assistant_msg: - thinking_content = "" final_content = "" # Extract analysis content if present if '<|channel|>analysis<|message|>' in assistant_msg: - # Split the message by the analysis tag to isolate the content that follows parts = assistant_msg.split('<|channel|>analysis<|message|>', 1) if len(parts) > 1: # The content is everything after the tag @@ -240,7 +172,6 @@ def generate_chat_prompt(user_input, state, **kwargs): # Extract final content if present final_tag_to_find = '<|channel|>final<|message|>' if final_tag_to_find in assistant_msg: - # Split the message by the final tag to isolate the content that follows parts = assistant_msg.split(final_tag_to_find, 1) if len(parts) > 1: # The content is everything after the tag @@ -260,7 +191,32 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, msg_dict) + # Handle Seed-OSS + elif '' in assistant_msg: + thinking_content = "" + final_content = assistant_msg + + # Extract thinking content if present + if '' in assistant_msg: + parts = assistant_msg.split('', 1) + if len(parts) > 1: + potential_content = parts[1] + if '' in potential_content: + thinking_content = potential_content.split('', 1)[0].strip() + final_content = parts[0] + potential_content.split('', 1)[1] + else: + thinking_content = potential_content.strip() + final_content = parts[0] + + # Insert as structured message + msg_dict = {"role": "assistant", "content": final_content.strip()} + if thinking_content: + msg_dict["reasoning_content"] = thinking_content + + messages.insert(insert_pos, msg_dict) + else: + # Default case (used by all other models) messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: @@ -286,125 +242,120 @@ def generate_chat_prompt(user_input, state, **kwargs): else: attachments_text += f"\nName: 
{filename}\nContents:\n\n=====\n{content}\n=====\n\n" - if image_refs or attachments_text: - enhanced_user_msg = user_msg - if image_refs: - enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}" - if attachments_text: - enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}" + if image_refs: + enhanced_user_msg = f"{image_refs}\n\n{enhanced_user_msg}" + if attachments_text: + enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}" messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) + # Handle the current user input user_input = user_input.strip() - # Check if we have attachments even with empty input - has_attachments = False - if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: - current_row_idx = len(history) - user_key = f"user_{current_row_idx}" - has_attachments = user_key in metadata and "attachments" in metadata[user_key] - - if (user_input or has_attachments) and not impersonate and not _continue: - # For the current user input being processed, check if we need to add attachments - if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + # Check if we have attachments + if not (impersonate or _continue): + has_attachments = False + if len(history_data.get('metadata', {})) > 0: current_row_idx = len(history) user_key = f"user_{current_row_idx}" + has_attachments = user_key in metadata and "attachments" in metadata[user_key] - if user_key in metadata and "attachments" in metadata[user_key]: - attachments_text = "" - image_refs = "" + if user_input or has_attachments: + # For the current user input being processed, check if we need to add attachments + if len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" - for attachment in metadata[user_key]["attachments"]: - if attachment.get("type") == "image": - image_refs += "<__media__>" - else: - filename = attachment.get("name", "file") - content = attachment.get("content", "") - if attachment.get("type") == "text/html" and attachment.get("url"): - attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + image_refs = "" + + for attachment in metadata[user_key]["attachments"]: + if attachment.get("type") == "image": + image_refs += "<__media__>" else: - attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + filename = attachment.get("name", "file") + content = attachment.get("content", "") + if attachment.get("type") == "text/html" and attachment.get("url"): + attachments_text += f"\nName: {filename}\nURL: {attachment['url']}\nContents:\n\n=====\n{content}\n=====\n\n" + else: + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" - if image_refs or attachments_text: - user_input = user_input if image_refs: user_input = f"{image_refs}\n\n{user_input}" if attachments_text: user_input += f"\n\nATTACHMENTS:\n{attachments_text}" - messages.append({"role": "user", "content": user_input}) + messages.append({"role": "user", "content": user_input}) + + if impersonate and state['mode'] != 'chat-instruct': + messages.append({"role": "user", "content": "fake user message replace me"}) def make_prompt(messages): - if state['mode'] == 'chat-instruct' and _continue: - prompt = renderer(messages=messages[:-1]) + last_message = messages[-1].copy() + if _continue: + if state['mode'] == 
'chat-instruct': + messages = messages[:-1] + else: + messages[-1]["content"] = "fake assistant message replace me" + messages.append({"role": "assistant", "content": "this will get deleted"}) + + if state['mode'] != 'chat-instruct': + add_generation_prompt = (not _continue and not impersonate) else: - prompt = renderer(messages=messages) + add_generation_prompt = False + + prompt = renderer( + messages=messages, + add_generation_prompt=add_generation_prompt + ) if state['mode'] == 'chat-instruct': - outer_messages = [] - if state['custom_system_message'].strip() != '': - outer_messages.append({"role": "system", "content": state['custom_system_message']}) - command = state['chat-instruct_command'] command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1']) command = command.replace('<|prompt|>', prompt) command = replace_character_names(command, state['name1'], state['name2']) - if _continue: - prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0] - prefix += messages[-1]["content"] - else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] - if not impersonate: - prefix = apply_extensions('bot_prefix', prefix, state) + outer_messages = [] + if state['custom_system_message'].strip() != '': + outer_messages.append({"role": "system", "content": state['custom_system_message']}) outer_messages.append({"role": "user", "content": command}) - outer_messages.append({"role": "assistant", "content": prefix}) + if _continue: + outer_messages.append(last_message.copy()) + outer_messages[-1]["content"] = "fake assistant message replace me" + outer_messages.append({"role": "assistant", "content": "this will get deleted"}) - prompt = instruct_renderer(messages=outer_messages) - suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] - if len(suffix) > 0: - prompt = prompt[:-len(suffix)] - else: - # Handle GPT-OSS as a special case when continuing - if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']: - last_message_to_continue = messages[-1] - prompt = renderer(messages=messages[:-1]) + prompt = instruct_renderer( + messages=outer_messages, + add_generation_prompt=not _continue + ) - # Start the assistant turn wrapper - assistant_reply_so_far = "<|start|>assistant" + if _continue: + prompt = prompt.split("fake assistant message replace me", 1)[0] - if 'thinking' in last_message_to_continue: - assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>" + content = last_message.get("content", "") + partial_thought = last_message.get("thinking", "") or last_message.get("reasoning_content", "") - assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}" - - prompt += assistant_reply_so_far - - else: - prompt = renderer(messages=messages) - if _continue: - suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] - if len(suffix) > 0: - prompt = prompt[:-len(suffix)] + # Handle partial thinking blocks (GPT-OSS and Seed-OSS) + if not content and partial_thought and partial_thought.strip(): + search_string = partial_thought.strip() + index = prompt.rfind(search_string) + if index != -1: + prompt = prompt[:index] + partial_thought else: - prefix = get_generation_prompt(renderer, impersonate=impersonate)[0] + # Fallback if search fails: just append the thought + prompt += partial_thought + else: + # All other cases + prompt += content - # Handle GPT-OSS as a special 
case when not continuing - if '<|channel|>final<|message|>' in state['instruction_template_str']: - if prefix.endswith("<|channel|>final<|message|>"): - prefix = prefix[:-len("<|channel|>final<|message|>")] + if impersonate: + prompt = prompt.split("fake user message replace me", 1)[0] + prompt += user_input - if impersonate: - prefix += "<|message|>" - - if state['mode'] == 'chat' and not impersonate: - prefix = apply_extensions('bot_prefix', prefix, state) - - prompt += prefix - - if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])): - prompt += get_thinking_suppression_string(instruction_template) + if state['mode'] in ['chat', 'chat-instruct'] and not impersonate and not _continue: + prompt += apply_extensions('bot_prefix', "", state) return prompt @@ -525,29 +476,48 @@ def get_stopping_strings(state): renderer = partial(template.render, add_generation_prompt=False) renderers.append(renderer) - if state['mode'] in ['chat', 'chat-instruct']: + if state['mode'] in ['chat']: template = jinja_env.from_string(state['chat_template_str']) renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2']) renderers.append(renderer) - for renderer in renderers: - prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False) - prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True) + fake_messages = [ + {"role": "user", "content": "first user message"}, + {"role": "assistant", "content": "first assistant message"}, + {"role": "user", "content": "second user message"}, + {"role": "assistant", "content": "second assistant message"}, + ] - stopping_strings += [ - suffix_user + prefix_bot, - suffix_user + prefix_user, - suffix_bot + prefix_bot, - suffix_bot + prefix_user, + stopping_strings = [] + for renderer in renderers: + prompt = renderer(messages=fake_messages) + + # Find positions of each message content + first_user_end = prompt.find("first user message") + len("first user message") + first_assistant_start = prompt.find("first assistant message") + first_assistant_end = prompt.find("first assistant message") + len("first assistant message") + second_user_start = prompt.find("second user message") + second_assistant_end = prompt.find("second assistant message") + len("second assistant message") + + # Extract pieces of text potentially containing unique stopping strings + texts = [ + prompt[first_user_end:first_assistant_start], + prompt[first_assistant_end:second_user_start], + prompt[second_assistant_end:] ] - # Try to find the EOT token - for item in stopping_strings.copy(): - item = item.strip() - if item.startswith("<") and ">" in item: - stopping_strings.append(item.split(">")[0] + ">") - elif item.startswith("[") and "]" in item: - stopping_strings.append(item.split("]")[0] + "]") + for text in texts: + stripped_text = text.strip() + if stripped_text.startswith("<") and ">" in stripped_text: + stopping_strings.append(stripped_text.split(">")[0] + ">") + elif stripped_text.startswith("[") and "]" in stripped_text: + stopping_strings.append(stripped_text.split("]")[0] + "]") + elif stripped_text.startswith("(") and ")" in stripped_text: + stopping_strings.append(stripped_text.split(")")[0] + ")") + elif stripped_text.startswith("{") and "}" in stripped_text: + stopping_strings.append(stripped_text.split("}")[0] + "}") + elif ":" in text: + stopping_strings.append(text.split(":")[0] + ":") if 'stopping_strings' in 
state and isinstance(state['stopping_strings'], list): stopping_strings += state.pop('stopping_strings') @@ -765,6 +735,8 @@ def generate_search_query(user_message, state): query = query.rsplit("", 1)[1] elif "<|start|>assistant<|channel|>final<|message|>" in query: query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1] + elif "" in query: + query = query.rsplit("", 1)[1] # Strip and remove surrounding quotes if present query = query.strip() @@ -906,6 +878,12 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Extract the reply if state['mode'] in ['chat', 'chat-instruct']: + reply = reply.lstrip() + if reply.startswith(state['name2'] + ':'): + reply = reply[len(state['name2'] + ':'):] + elif reply.startswith(state['name1'] + ':'): + reply = reply[len(state['name1'] + ':'):] + visible_reply = re.sub("(||{{user}})", state['name1'], reply) else: visible_reply = reply diff --git a/modules/html_generator.py b/modules/html_generator.py index 279f9ba6..492b52bd 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -137,7 +137,7 @@ def extract_thinking_block(string): remaining_content = string[content_start:] return thinking_content, remaining_content - # If think tags not found, try alternative format + # If think tags not found, try GPT-OSS alternative format ALT_START = "<|channel|>analysis<|message|>" ALT_END = "<|end|>" ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>" @@ -168,7 +168,31 @@ def extract_thinking_block(string): remaining_content = string[content_start:] return thinking_content, remaining_content - # Return if neither format is found + # Try seed:think format + SEED_START = "<seed:think>" + SEED_END = "</seed:think>" + + seed_start_pos = string.find(SEED_START) + seed_end_pos = string.find(SEED_END) + + if seed_start_pos != -1 or seed_end_pos != -1: + if seed_start_pos == -1: + thought_start = 0 + else: + thought_start = seed_start_pos + len(SEED_START) + + if seed_end_pos == -1: + thought_end = len(string) + content_start = len(string) + else: + thought_end = seed_end_pos + content_start = seed_end_pos + len(SEED_END) + + thinking_content = string[thought_start:thought_end] + remaining_content = string[content_start:] + return thinking_content, remaining_content + + # Return if no format is found return None, string @@ -219,6 +243,27 @@ def process_markdown_content(string): if not string: return "" + # Define a unique placeholder for LaTeX asterisks + LATEX_ASTERISK_PLACEHOLDER = "LATEXASTERISKPLACEHOLDER" + + def protect_asterisks_in_latex(match): + """A replacer function for re.sub to protect asterisks in multiple LaTeX formats.""" + # Check which delimiter group was captured + if match.group(1) is not None: # Content from $$...$$ + content = match.group(1) + modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER) + return f'$${modified_content}$$' + elif match.group(2) is not None: # Content from \[...\] + content = match.group(2) + modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER) + return f'\\[{modified_content}\\]' + elif match.group(3) is not None: # Content from \(...\) + content = match.group(3) + modified_content = content.replace('*', LATEX_ASTERISK_PLACEHOLDER) + return f'\\({modified_content}\\)' + + return match.group(0) # Fallback + # Make \[ \] LaTeX equations inline pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$' replacement = r'\\[ \1 \\]' @@ -248,6 +293,10 @@ def process_markdown_content(string): string = 
string.replace('\\end{equation*}', '$$') string = re.sub(r"(.)```", r"\1\n```", string) + # Protect asterisks within all LaTeX blocks before markdown conversion + latex_pattern = re.compile(r'\$\$(.*?)\$\$|\\\[(.*?)\\\]|\\\((.*?)\\\)', re.DOTALL) + string = latex_pattern.sub(protect_asterisks_in_latex, string) + result = '' is_code = False is_latex = False @@ -306,6 +355,9 @@ def process_markdown_content(string): # Convert to HTML using markdown html_output = markdown.markdown(result, extensions=['fenced_code', 'tables', SaneListExtension()]) + # Restore the LaTeX asterisks after markdown conversion + html_output = html_output.replace(LATEX_ASTERISK_PLACEHOLDER, '*') + # Remove extra newlines before html_output = re.sub(r'\s*', '', html_output) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 5953803a..38589cf2 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -20,6 +20,7 @@ from modules.image_utils import ( convert_pil_to_base64 ) from modules.logging_colors import logger +from modules.utils import resolve_model_path llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"} @@ -192,7 +193,7 @@ class LlamaServer: if shared.args.verbose: logger.info("GENERATE_PARAMS=") - printable_payload = {k: (v if k != "prompt" else "[multimodal object]" if pil_images else v) for k, v in payload.items()} + printable_payload = {k: v for k, v in payload.items() if k != "prompt"} pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() @@ -315,10 +316,9 @@ class LlamaServer: "--batch-size", str(shared.args.batch_size), "--port", str(self.port), "--no-webui", + "--flash-attn", "on", ] - if shared.args.flash_attn: - cmd.append("--flash-attn") if shared.args.threads > 0: cmd += ["--threads", str(shared.args.threads)] if shared.args.threads_batch > 0: @@ -351,14 +351,12 @@ class LlamaServer: if path.exists(): cmd += ["--mmproj", str(path)] if shared.args.model_draft not in [None, 'None']: - path = Path(shared.args.model_draft) - if not path.exists(): - path = Path(f'{shared.args.model_dir}/{shared.args.model_draft}') + path = resolve_model_path(shared.args.model_draft) if path.is_file(): model_file = path else: - model_file = sorted(Path(f'{shared.args.model_dir}/{shared.args.model_draft}').glob('*.gguf'))[0] + model_file = sorted(path.glob('*.gguf'))[0] cmd += ["--model-draft", model_file] if shared.args.draft_max > 0: @@ -411,8 +409,7 @@ class LlamaServer: self.process = subprocess.Popen( cmd, stderr=subprocess.PIPE, - text=True, - bufsize=1, + bufsize=0, env=env ) @@ -474,34 +471,55 @@ def filter_stderr_with_progress(process_stderr): last_was_progress = False try: - for raw in iter(process_stderr.readline, ''): - line = raw.rstrip('\r\n') - match = progress_re.search(line) + # Read in binary mode and decode manually + buffer = b"" + while True: + # Read chunks aggressively to prevent buffer overflow + chunk = process_stderr.read(4096) + if not chunk: + break - if match: - progress = float(match.group(1)) + buffer += chunk - # Extract just the part from "prompt processing" onwards - prompt_processing_idx = line.find('prompt processing') - if prompt_processing_idx != -1: - display_line = line[prompt_processing_idx:] - else: - display_line = line # fallback to full line + # Process complete lines + while b'\n' in buffer: + line_bytes, buffer = buffer.split(b'\n', 1) + try: + line = line_bytes.decode('utf-8', errors='replace').strip('\r\n') + if line: # Process non-empty lines + match = progress_re.search(line) - # choose carriage 
return for in-progress or newline at completion - end_char = '\r' if progress < 1.0 else '\n' - print(display_line, end=end_char, file=sys.stderr, flush=True) - last_was_progress = (progress < 1.0) + if match: + progress = float(match.group(1)) - # skip noise lines - elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line): - # if we were in progress, finish that line first - if last_was_progress: - print(file=sys.stderr) + # Extract just the part from "prompt processing" onwards + prompt_processing_idx = line.find('prompt processing') + if prompt_processing_idx != -1: + display_line = line[prompt_processing_idx:] + else: + display_line = line # fallback to full line - print(line, file=sys.stderr, flush=True) - last_was_progress = False + # choose carriage return for in-progress or newline at completion + end_char = '\r' if progress < 1.0 else '\n' + print(display_line, end=end_char, file=sys.stderr, flush=True) + last_was_progress = (progress < 1.0) + + # skip noise lines + elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line): + # if we were in progress, finish that line first + if last_was_progress: + print(file=sys.stderr) + + print(line, file=sys.stderr, flush=True) + last_was_progress = False + + except Exception: + continue except (ValueError, IOError): - # silently ignore broken output or IO errors pass + finally: + try: + process_stderr.close() + except: + pass diff --git a/modules/loaders.py b/modules/loaders.py index f88e976d..fe982ab5 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -16,7 +16,6 @@ loaders_and_params = OrderedDict({ 'streaming_llm', 'rope_freq_base', 'compress_pos_emb', - 'flash_attn', 'row_split', 'no_kv_offload', 'no_mmap', diff --git a/modules/models.py b/modules/models.py index ca3d184f..9535ea82 100644 --- a/modules/models.py +++ b/modules/models.py @@ -1,10 +1,10 @@ import sys import time -from pathlib import Path import modules.shared as shared from modules.logging_colors import logger from modules.models_settings import get_model_metadata +from modules.utils import resolve_model_path last_generation_time = time.time() @@ -45,18 +45,19 @@ def load_model(model_name, loader=None): model, tokenizer = output else: model = output - if model is None: - return None, None - else: + if model is not None: from modules.transformers_loader import load_tokenizer tokenizer = load_tokenizer(model_name) + if model is None: + return None, None + shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp': shared.settings['truncation_length'] = shared.args.ctx_size shared.is_multimodal = False - if loader.lower() in ('exllamav3', 'llama.cpp'): + if loader.lower() in ('exllamav3', 'llama.cpp') and hasattr(model, 'is_multimodal'): shared.is_multimodal = model.is_multimodal() logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") @@ -69,17 +70,24 @@ def load_model(model_name, loader=None): def llama_cpp_server_loader(model_name): from modules.llama_cpp_server import LlamaServer - path = Path(f'{shared.args.model_dir}/{model_name}') + path = resolve_model_path(model_name) + if path.is_file(): model_file = path else: - model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] + gguf_files = sorted(path.glob('*.gguf')) + if not gguf_files: + logger.error(f"No .gguf models found in the directory: {path}") + return None, 
None + + model_file = gguf_files[0] try: model = LlamaServer(model_file) return model, model except Exception as e: logger.error(f"Error loading the model with llama.cpp: {str(e)}") + return None, None def transformers_loader(model_name): diff --git a/modules/models_settings.py b/modules/models_settings.py index c325fa0c..6dc000b4 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -10,6 +10,7 @@ import yaml from modules import chat, loaders, metadata_gguf, shared, ui from modules.logging_colors import logger +from modules.utils import resolve_model_path def get_fallback_settings(): @@ -26,6 +27,7 @@ def get_fallback_settings(): def get_model_metadata(model): + model_path = resolve_model_path(model) model_settings = {} # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml @@ -35,7 +37,7 @@ def get_model_metadata(model): for k in settings[pat]: model_settings[k] = settings[pat][k] - path = Path(f'{shared.args.model_dir}/{model}/config.json') + path = model_path / 'config.json' if path.exists(): hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read()) else: @@ -51,7 +53,7 @@ def get_model_metadata(model): # GGUF metadata if model_settings['loader'] == 'llama.cpp': - path = Path(f'{shared.args.model_dir}/{model}') + path = model_path if path.is_file(): model_file = path else: @@ -66,7 +68,7 @@ def get_model_metadata(model): metadata = load_gguf_metadata_with_cache(model_file) for k in metadata: - if k.endswith('context_length'): + if k.endswith('.context_length'): model_settings['ctx_size'] = min(metadata[k], 8192) model_settings['truncation_length_info'] = metadata[k] elif k.endswith('rope.freq_base'): @@ -92,8 +94,6 @@ def get_model_metadata(model): template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'raise_exception\([^)]*\)', "''", template) - template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) - template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template @@ -130,18 +130,18 @@ def get_model_metadata(model): model_settings['bf16'] = True # Try to find the Jinja instruct template - path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' + path = model_path / 'tokenizer_config.json' template = None # 1. Prioritize reading from chat_template.jinja if it exists - jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja' + jinja_path = model_path / 'chat_template.jinja' if jinja_path.exists(): with open(jinja_path, 'r', encoding='utf-8') as f: template = f.read() # 2. 
If no .jinja file, try chat_template.json if template is None: - json_template_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.json' + json_template_path = model_path / 'chat_template.json' if json_template_path.exists(): with open(json_template_path, 'r', encoding='utf-8') as f: json_data = json.load(f) @@ -170,8 +170,6 @@ def get_model_metadata(model): template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL) template = re.sub(r'raise_exception\([^)]*\)', "''", template) - template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL) - template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS model_settings['instruction_template'] = 'Custom (obtained from model metadata)' model_settings['instruction_template_str'] = template @@ -201,7 +199,7 @@ def get_model_metadata(model): def infer_loader(model_name, model_settings, hf_quant_method=None): - path_to_model = Path(f'{shared.args.model_dir}/{model_name}') + path_to_model = resolve_model_path(model_name) if not path_to_model.exists(): loader = None elif shared.args.portable: @@ -357,7 +355,7 @@ def get_model_size_mb(model_file: Path) -> float: def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): - model_file = Path(f'{shared.args.model_dir}/{gguf_file}') + model_file = resolve_model_path(gguf_file) metadata = load_gguf_metadata_with_cache(model_file) size_in_mb = get_model_size_mb(model_file) diff --git a/modules/prompts.py b/modules/prompts.py index 79d9b56e..b800af91 100644 --- a/modules/prompts.py +++ b/modules/prompts.py @@ -22,8 +22,7 @@ def load_prompt(fname): if file_path.exists(): with open(file_path, 'r', encoding='utf-8') as f: text = f.read() - if len(text) > 0 and text[-1] == '\n': - text = text[:-1] + text = text.rstrip() return text else: diff --git a/modules/shared.py b/modules/shared.py index 644261a0..4daf43c9 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -31,7 +31,7 @@ persistent_interface_state = {} need_restart = False # Parser copied from https://github.com/vladmandic/automatic -parser = argparse.ArgumentParser(description="Text generation web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200)) +parser = argparse.ArgumentParser(description="Text Generation Web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200)) # Basic settings group = parser.add_argument_group('Basic settings') @@ -73,7 +73,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for # llama.cpp group = parser.add_argument_group('llama.cpp') -group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.') group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') @@ -159,9 +158,6 @@ group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 f group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API') group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. 
Useful for launching the API in standalone mode.') -# Deprecated parameters -group = parser.add_argument_group('Deprecated') - # Handle CMD_FLAGS.txt cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt" if cmd_flags_path.exists(): @@ -203,7 +199,7 @@ settings = { 'start_with': '', 'mode': 'instruct', 'chat_style': 'cai-chat', - 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', + 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>". Reply directly, without starting the reply with the character name.\n\n<|prompt|>', 'enable_web_search': False, 'web_search_pages': 3, 'prompt-notebook': '', @@ -287,7 +283,7 @@ settings = { 'greeting': 'How can I help you today?', 'custom_system_message': '', 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", - 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}", + 'chat_template_str': "{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {%- if message['content'] -%}\n {{- message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- if user_bio -%}\n {{- user_bio + '\\n\\n' -}}\n {%- endif -%}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{- name1 + ': ' + message['content'] + '\\n'-}}\n {%- else -%}\n {{- name2 + ': ' + message['content'] + '\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- name2 + ':' -}}\n{%- endif %}", # Extensions 'default_extensions': [], diff --git a/modules/ui.py b/modules/ui.py index 502005e7..12f43768 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -142,7 +142,6 @@ def list_model_elements(): 'num_experts_per_token', 'load_in_8bit', 'load_in_4bit', - 'flash_attn', 'attn_implementation', 'cpu', 'disk', diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 94c980bb..7c388607 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -78,21 +78,21 @@ def create_ui(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) - gr.HTML("
") + gr.HTML("") shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.') - gr.HTML("
") + gr.HTML("") shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search') with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) - gr.HTML("
") + gr.HTML("") with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') + shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='In instruct and chat-instruct modes, the template under Parameters > Instruction template is used.', elem_id='chat-mode') with gr.Row(): shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') @@ -100,7 +100,7 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) - gr.HTML("
") + gr.HTML("") with gr.Row(): shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm') diff --git a/modules/ui_default.py b/modules/ui_default.py index 44af48a3..c0feae19 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -22,8 +22,7 @@ def create_ui(): with gr.Row(): with gr.Column(): with gr.Row(): - initial_text = load_prompt(shared.settings['prompt-notebook']) - shared.gradio['textbox-default'] = gr.Textbox(value=initial_text, lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar']) + shared.gradio['textbox-default'] = gr.Textbox(value="", lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar']) shared.gradio['token-counter-default'] = gr.HTML(value="0", elem_id="default-token-counter") with gr.Row(): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index dd240627..729700d4 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -50,7 +50,6 @@ def create_ui(): with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) - shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index 939d81f7..9fab879b 100644 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -30,8 +30,7 @@ def create_ui(): with gr.Column(scale=4): with gr.Tab('Raw'): with gr.Row(): - initial_text = load_prompt(shared.settings['prompt-notebook']) - shared.gradio['textbox-notebook'] = gr.Textbox(label="", value=initial_text, lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar']) + shared.gradio['textbox-notebook'] = gr.Textbox(label="", value="", lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar']) shared.gradio['token-counter-notebook'] = gr.HTML(value="0", elem_id="notebook-token-counter") with gr.Tab('Markdown'): diff --git a/modules/utils.py b/modules/utils.py index 4927ef04..e8d23a02 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -86,6 +86,19 @@ def check_model_loaded(): return True, None +def resolve_model_path(model_name_or_path): + """ + Resolves a model path, checking for a direct path + before the default models directory. 
+ """ + + path_candidate = Path(model_name_or_path) + if path_candidate.exists(): + return path_candidate + else: + return Path(f'{shared.args.model_dir}/{model_name_or_path}') + + def get_available_models(): # Get all GGUF files gguf_files = get_available_ggufs() diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 9f906b26..3a3b899c 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" bitsandbytes==0.46.* colorama datasets @@ -34,8 +35,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 70e031b8..388da65c 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,4 +1,5 @@ accelerate==1.8.* +audioop-lts<1.0; python_version >= "3.13" colorama datasets einops @@ -33,7 +34,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git 
a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 81556326..d1635779 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -1,4 +1,5 @@
 accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 einops
@@ -33,7 +34,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 7b9d3650..dde8d4a1 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -1,4 +1,5 @@
 accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 einops
@@ -33,7 +34,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 0fc9162f..9b1776ca 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -1,4 +1,5 @@
 accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 einops
@@ -33,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 3565a994..17d907bc 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -1,4 +1,5 @@
 accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 einops
@@ -33,5 +34,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 64c17416..8c095428 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -1,4 +1,5 @@
 accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 einops
@@ -33,5 +34,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 2b162308..553e8cfb 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -1,4 +1,5 @@
 accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
 bitsandbytes==0.46.*
 colorama
 datasets
@@ -34,8 +35,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.6/exllamav3-0.0.6+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index cd85a744..74d86047 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -1,4 +1,5 @@
 accelerate==1.8.*
+audioop-lts<1.0; python_version >= "3.13"
 colorama
 datasets
 einops
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 943ea600..e77ce7b1 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 394b89b6..dc45ef37 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index cffe3aea..541f96d4 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
@@ -18,6 +19,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index d274e2c8..2af3b4b9 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 47ec086e..6a5f5740 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 9a0a3694..a7f2405b 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index b7b73eff..be624bb1 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 45e96da9..bb2b0f28 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 9183562e..404f1267 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -1,3 +1,4 @@
+audioop-lts<1.0; python_version >= "3.13"
 fastapi==0.112.4
 gradio==4.37.*
 html2text==2025.4.15
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.37.0/llama_cpp_binaries-0.37.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.42.0/llama_cpp_binaries-0.42.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/server.py b/server.py
index 7ce3c208..c804c342 100644
--- a/server.py
+++ b/server.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from modules import shared
 from modules.block_requests import OpenMonkeyPatch, RequestBlocker
 from modules.logging_colors import logger
+from modules.prompts import load_prompt
 
 # Set up Gradio temp directory path
 gradio_temp_path = Path('user_data') / 'cache' / 'gradio'
@@ -70,7 +71,7 @@ from modules.utils import gradio
 
 
 def signal_handler(sig, frame):
-    logger.info("Received Ctrl+C. Shutting down Text generation web UI gracefully.")
+    logger.info("Received Ctrl+C. Shutting down Text Generation Web UI gracefully.")
 
     # Explicitly stop LlamaServer to avoid __del__ cleanup issues during shutdown
     if shared.model and shared.model.__class__.__name__ == 'LlamaServer':
@@ -87,7 +88,7 @@ signal.signal(signal.SIGINT, signal_handler)
 
 
 def create_interface():
-    title = 'Text generation web UI'
+    title = 'Text Generation Web UI'
 
     # Password authentication
     auth = []
@@ -109,6 +110,13 @@ def create_interface():
         'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp'
     })
 
+    if shared.settings['prompt-notebook']:
+        prompt = load_prompt(shared.settings['prompt-notebook'])
+        shared.persistent_interface_state.update({
+            'textbox-default': prompt,
+            'textbox-notebook': prompt
+        })
+
     # Clear existing cache files
     for cache_file in ['pfp_character.png', 'pfp_character_thumb.png']:
         cache_path = Path(f"user_data/cache/{cache_file}")
@@ -230,7 +238,7 @@ def create_interface():
 
 
 if __name__ == "__main__":
-    logger.info("Starting Text generation web UI")
+    logger.info("Starting Text Generation Web UI")
     do_cmd_flags_warnings()
 
     # Load custom settings
@@ -283,21 +291,14 @@ if __name__ == "__main__":
 
     # If any model has been selected, load it
     if shared.model_name != 'None':
-        p = Path(shared.model_name)
-        if p.exists():
-            model_name = p.parts[-1]
-            shared.model_name = model_name
-        else:
-            model_name = shared.model_name
-
-        model_settings = get_model_metadata(model_name)
+        model_settings = get_model_metadata(shared.model_name)
         update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments
 
         # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
         if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
             vram_usage, adjusted_layers = update_gpu_layers_and_vram(
                 shared.args.loader,
-                model_name,
+                shared.model_name,
                 model_settings['gpu_layers'],
                 shared.args.ctx_size,
                 shared.args.cache_type,
@@ -308,7 +309,7 @@ if __name__ == "__main__":
             shared.args.gpu_layers = adjusted_layers
 
         # Load the model
-        shared.model, shared.tokenizer = load_model(model_name)
+        shared.model, shared.tokenizer = load_model(shared.model_name)
 
         if shared.args.lora:
             add_lora_to_model(shared.args.lora)
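The `audioop-lts` pin added across these requirements files, like the platform-specific wheel URLs, is gated by a PEP 508 environment marker that pip evaluates at install time. Below is a minimal, illustrative sketch of how such markers resolve, assuming the third-party `packaging` library (the parsing machinery pip builds on) is available; the snippet is not part of the patch itself.

```python
# Illustrative only (not part of the patch): evaluating PEP 508 markers
# like the ones used in these requirements files with the `packaging` library.
from packaging.markers import Marker
from packaging.requirements import Requirement

# The new pin only applies on Python 3.13+, where the stdlib audioop module
# was removed and the audioop-lts backport takes its place.
req = Requirement('audioop-lts<1.0; python_version >= "3.13"')
print(req.name, req.specifier)   # audioop-lts <1.0
print(req.marker.evaluate())     # True only when running on Python >= 3.13

# The wheel URLs are guarded the same way; a marker can also be checked
# against an explicit environment instead of the running interpreter.
marker = Marker('platform_system == "Linux" and platform_machine == "x86_64"')
print(marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64"}))  # True
```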