From b0baf7518bfd76d761b04ec7168e184089e18cab Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 19 Nov 2025 06:07:15 -0800
Subject: [PATCH 1/7] Remove macos x86-64 portable builds (macos-13 runner
deprecated by GitHub)
---
.github/workflows/build-portable-release.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml
index 51567ec8..a5b5b7c5 100644
--- a/.github/workflows/build-portable-release.yml
+++ b/.github/workflows/build-portable-release.yml
@@ -57,7 +57,7 @@ jobs:
id: set-matrix
run: |
$matrix = @{
- 'os' = @('ubuntu-22.04', 'windows-2022', 'macos-13', 'macos-14')
+ 'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
'pyver' = @("3.11")
'avx' = @("AVX2")
}
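For context, GitHub Actions expands `strategy.matrix` into one job per combination of values, so dropping 'macos-13' removes exactly one job. A minimal Python sketch of that expansion (values copied from the workflow above; the real expansion is done by Actions itself, not by any code in this repo):

    from itertools import product

    matrix = {
        'os': ['ubuntu-22.04', 'windows-2022', 'macos-14'],
        'pyver': ['3.11'],
        'avx': ['AVX2'],
    }
    # One job per combination: three jobs, one per remaining OS
    for job in (dict(zip(matrix, combo)) for combo in product(*matrix.values())):
        print(job)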
From 8f0048663d815bd9f4e253f5e87f9ea4d309caf9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 21 Nov 2025 07:09:16 -0800
Subject: [PATCH 2/7] More modular HTML generator
---
modules/html_generator.py | 109 ++++++++++++++++++++++++--------------
1 file changed, 69 insertions(+), 40 deletions(-)
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 492b52bd..ccc232a0 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -196,50 +196,45 @@ def extract_thinking_block(string):
return None, string
-@functools.lru_cache(maxsize=None)
-def convert_to_markdown(string, message_id=None):
- if not string:
+def build_thinking_block(thinking_content, message_id, has_remaining_content):
+ """Build HTML for a thinking block."""
+ if thinking_content is None:
+ return None
+
+ # Process the thinking content through markdown
+ thinking_html = process_markdown_content(thinking_content)
+
+ # Generate unique ID for the thinking block
+ block_id = f"thinking-{message_id}-0"
+
+ # Check if thinking is complete or still in progress
+ is_streaming = not has_remaining_content
+ title_text = "Thinking..." if is_streaming else "Thought"
+
+    return f'''
+    <details class="thinking-block" id="{block_id}">
+        <summary>{title_text}</summary>
+        <div class="thinking-content">{thinking_html}</div>
+    </details>
+    '''
+
+
+def build_main_content_block(content):
+ """Build HTML for the main content block."""
+ if not content:
return ""
- # Use a default message ID if none provided
- if message_id is None:
- message_id = "unknown"
-
- # Extract thinking block if present
- thinking_content, remaining_content = extract_thinking_block(string)
-
- # Process the main content
- html_output = process_markdown_content(remaining_content)
-
- # If thinking content was found, process it using the same function
- if thinking_content is not None:
- thinking_html = process_markdown_content(thinking_content)
-
- # Generate unique ID for the thinking block
- block_id = f"thinking-{message_id}-0"
-
- # Check if thinking is complete or still in progress
- is_streaming = not remaining_content
- title_text = "Thinking..." if is_streaming else "Thought"
-
-        thinking_block = f'''
-        <details class="thinking-block" id="{block_id}">
-            <summary>{title_text}</summary>
-            <div class="thinking-content">{thinking_html}</div>
-        </details>
-        '''
-
- # Prepend the thinking block to the message HTML
- html_output = thinking_block + html_output
-
- return html_output
+ return process_markdown_content(content)
def process_markdown_content(string):
- """Process a string through the markdown conversion pipeline."""
+ """
+ Process a string through the markdown conversion pipeline.
+    Uses robust manual parsing to ensure correct LaTeX and code block rendering.
+ """
if not string:
return ""
@@ -280,7 +275,7 @@ def process_markdown_content(string):
pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
string = pattern.sub(replace_blockquote, string)
- # Code
+ # Code block standardization
string = string.replace('\\begin{code}', '```')
string = string.replace('\\end{code}', '```')
string = string.replace('\\begin{align*}', '$$')
@@ -301,6 +296,7 @@ def process_markdown_content(string):
is_code = False
is_latex = False
+ # Manual line iteration for robust structure parsing
for line in string.split('\n'):
stripped_line = line.strip()
@@ -371,6 +367,39 @@ def process_markdown_content(string):
return html_output
+@functools.lru_cache(maxsize=None)
+def convert_to_markdown(string, message_id=None):
+ """
+ Convert a string to markdown HTML with support for multiple block types.
+ Blocks are assembled in order: thinking, main content, etc.
+ """
+ if not string:
+ return ""
+
+ # Use a default message ID if none provided
+ if message_id is None:
+ message_id = "unknown"
+
+ # Extract different components from the string
+ thinking_content, remaining_content = extract_thinking_block(string)
+
+ # Build individual HTML blocks
+ blocks = []
+
+ # Add thinking block if present
+ thinking_html = build_thinking_block(thinking_content, message_id, bool(remaining_content))
+ if thinking_html:
+ blocks.append(thinking_html)
+
+ # Add main content block
+ main_html = build_main_content_block(remaining_content)
+ if main_html:
+ blocks.append(main_html)
+
+ # Assemble all blocks into final HTML
+ return ''.join(blocks)
+
+
def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
'''
Used to avoid caching convert_to_markdown calls during streaming.
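The refactor splits convert_to_markdown into composable builders: extract_thinking_block separates any reasoning text, build_thinking_block and build_main_content_block each render one block, and the results are joined in order. A minimal standalone sketch of that assembly logic, including the streaming title flip (simplified: the real blocks are HTML produced by process_markdown_content):

    def assemble_message(thinking, main):
        # Stand-in for build_thinking_block + build_main_content_block:
        # with no main content yet (still streaming), the title reads
        # "Thinking..."; once content follows it, the title flips to "Thought".
        blocks = []
        if thinking is not None:
            title = "Thought" if main else "Thinking..."
            blocks.append(f"[{title}] {thinking}")
        if main:
            blocks.append(main)
        return "".join(blocks)

    print(assemble_message("step 1", ""))        # [Thinking...] step 1
    print(assemble_message("step 1", "answer"))  # [Thought] step 1answer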
From 400bb0694b148a8c099636e2d546c52df3614cfd Mon Sep 17 00:00:00 2001
From: GodEmperor785 <32663080+GodEmperor785@users.noreply.github.com>
Date: Fri, 21 Nov 2025 20:56:02 +0100
Subject: [PATCH 3/7] Add slider for --ubatch-size for llama.cpp loader, change
defaults for better MoE performance (#7316)
---
modules/llama_cpp_server.py | 1 +
modules/loaders.py | 1 +
modules/shared.py | 3 ++-
modules/ui.py | 1 +
modules/ui_model_menu.py | 1 +
5 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 47d9d27c..f83ed663 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -317,6 +317,7 @@ class LlamaServer:
"--ctx-size", str(shared.args.ctx_size),
"--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
+ "--ubatch-size", str(shared.args.ubatch_size),
"--port", str(self.port),
"--no-webui",
"--flash-attn", "on",
diff --git a/modules/loaders.py b/modules/loaders.py
index 0f0f6d1e..0bf3781b 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -10,6 +10,7 @@ loaders_and_params = OrderedDict({
'threads',
'threads_batch',
'batch_size',
+ 'ubatch_size',
'ctx_size',
'cache_type',
'tensor_split',
diff --git a/modules/shared.py b/modules/shared.py
index 1cca1233..289fda54 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -74,7 +74,8 @@ group.add_argument('--row-split', action='store_true', help='Split the model by
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
-group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
+group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size')
+group.add_argument('--ubatch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
diff --git a/modules/ui.py b/modules/ui.py
index d8dcedfb..f99e8b6a 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -129,6 +129,7 @@ def list_model_elements():
'threads',
'threads_batch',
'batch_size',
+ 'ubatch_size',
'ctx_size',
'cache_type',
'tensor_split',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 31ab929f..86adc229 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -84,6 +84,7 @@ def create_ui():
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
+ shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
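llama.cpp distinguishes the logical batch (--batch-size, how many prompt tokens are submitted per decode call) from the physical micro-batch (--ubatch-size, the most the device computes at once); the physical size is effectively capped by the logical one. A sketch of how the two new defaults flow into the llama-server command line, mirroring the llama_cpp_server.py change above (argument setup simplified):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', type=int, default=1024)   # logical batch (-b in llama.cpp)
    parser.add_argument('--ubatch-size', type=int, default=1024)  # physical micro-batch (-ub)
    args = parser.parse_args([])

    cmd = [
        "llama-server",
        "--batch-size", str(args.batch_size),
        "--ubatch-size", str(args.ubatch_size),
    ]
    print(" ".join(cmd))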
From 3cad0cd4c189e98bea4c781b240a7cf9f4106dfc Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 28 Nov 2025 03:52:37 -0800
Subject: [PATCH 4/7] Update llama.cpp
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_amd.txt | 4 ++--
requirements/full/requirements_amd_noavx2.txt | 4 ++--
requirements/full/requirements_apple_intel.txt | 4 ++--
requirements/full/requirements_apple_silicon.txt | 4 ++--
requirements/full/requirements_cpu_only.txt | 4 ++--
requirements/full/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements.txt | 4 ++--
requirements/portable/requirements_amd.txt | 4 ++--
requirements/portable/requirements_amd_noavx2.txt | 4 ++--
requirements/portable/requirements_apple_intel.txt | 4 ++--
requirements/portable/requirements_apple_silicon.txt | 4 ++--
requirements/portable/requirements_cpu_only.txt | 4 ++--
requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++--
requirements/portable/requirements_noavx2.txt | 4 ++--
requirements/portable/requirements_vulkan.txt | 4 ++--
requirements/portable/requirements_vulkan_noavx2.txt | 4 ++--
18 files changed, 36 insertions(+), 36 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 7075f955..67628396 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 39e4abd6..d9bf64cb 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -38,7 +38,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 00e2172c..2c06c22f 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -38,7 +38,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index dc1d1820..a4929d40 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -38,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index 0ccf95a5..6a7ce8a6 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -38,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 357daa9f..05957700 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -38,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 4e8db9e9..9e43c409 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -38,5 +38,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 2204a40e..bf3c0734 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -40,8 +40,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index a876e0b8..5f9b69e8 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 3dfa6d3f..10a6b5af 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt
index 57c6d552..4686b9de 100644
--- a/requirements/portable/requirements_amd_noavx2.txt
+++ b/requirements/portable/requirements_amd_noavx2.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+rocm6.4.4avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index f7e4e6e4..890568f4 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 99f55d33..e480db8f 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 008c20af..c2a7d040 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index d43ab758..e243d4be 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index c7f4703d..3db163b0 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index d339f97f..7758131c 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 1b4e3bd7..dcee8d05 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.61.0/llama_cpp_binaries-0.61.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
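Each requirement line above carries a PEP 508 environment marker, which is why pip installs exactly one wheel per platform from these lists. A sketch of how such a marker evaluates, using the packaging library that pip builds on:

    from packaging.markers import Marker

    marker = Marker('platform_system == "Linux" and platform_machine == "x86_64"')
    print(marker.evaluate())  # True only on x86-64 Linux, so only that wheel installs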
From 78b315344abca1863366219a1bbe792e91c817e1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 28 Nov 2025 06:45:05 -0800
Subject: [PATCH 5/7] Update exllamav3
---
requirements/full/requirements.txt | 4 ++--
requirements/full/requirements_noavx2.txt | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 67628396..4be87c56 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -42,8 +42,8 @@ tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index bf3c0734..9931e1dc 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -42,8 +42,8 @@ tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.62.0/llama_cpp_binaries-0.62.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav3/releases/download/v0.0.15/exllamav3-0.0.15+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav3/releases/download/v0.0.16/exllamav3-0.0.16+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
From 5327bc939705c0f65e77f8925ffd248fd3e66b88 Mon Sep 17 00:00:00 2001
From: oobabooga
Date: Fri, 28 Nov 2025 22:48:05 -0300
Subject: [PATCH 6/7] Update modules/shared.py
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
modules/shared.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/modules/shared.py b/modules/shared.py
index 289fda54..134c0cac 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -74,7 +74,7 @@ group.add_argument('--row-split', action='store_true', help='Split the model by
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
-group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size')
+group.add_argument('--batch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the application level batch size.')
group.add_argument('--ubatch-size', type=int, default=1024, help='Maximum number of prompt tokens to batch together when calling llama-server. This is the max physical batch size for computation (device level).')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
From 661e42d2b7458a82ea0e98b1c732ba6a91dc4100 Mon Sep 17 00:00:00 2001
From: aidevtime
Date: Sat, 29 Nov 2025 09:59:36 +0800
Subject: [PATCH 7/7] fix(deps): upgrade coqui-tts to >=0.27.0 for transformers
4.55 compatibility (#7329)
---
extensions/coqui_tts/requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/extensions/coqui_tts/requirements.txt b/extensions/coqui_tts/requirements.txt
index b0b691e8..e7eb3691 100644
--- a/extensions/coqui_tts/requirements.txt
+++ b/extensions/coqui_tts/requirements.txt
@@ -1 +1 @@
-coqui-tts==0.25.1
+coqui-tts>=0.27.0
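Loosening the pin from ==0.25.1 to a >=0.27.0 floor lets pip's resolver accept any current or future coqui-tts release, which is what allows the extension to track transformers 4.55. A sketch of the difference using packaging's SpecifierSet (the version 0.27.2 below is illustrative, not a known release):

    from packaging.specifiers import SpecifierSet

    old, new = SpecifierSet("==0.25.1"), SpecifierSet(">=0.27.0")
    print("0.27.2" in old)  # False: the exact pin rejects anything else
    print("0.27.2" in new)  # True: the floor admits current and future releases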