From 7e54e7b7ae62b227fbd896b2daf704db1658baa5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 17 Mar 2026 19:47:55 -0700
Subject: [PATCH] llama.cpp: Support literal flags in `--extra-flags` (e.g.
 `--rpc`, `--jinja`)

The old format is still accepted for backwards compatibility.
---
 modules/llama_cpp_server.py | 37 +++++++++++++++++++++----------------
 modules/shared.py           |  2 +-
 modules/ui_model_menu.py    |  2 +-
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 321a6d75..6dd36b2a 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -1,6 +1,7 @@
 import json
 import os
 import pprint
 import re
+import shlex
 import socket
 import subprocess
@@ -446,24 +447,28 @@ class LlamaServer:
         elif extra_flags.startswith("'") and extra_flags.endswith("'"):
             extra_flags = extra_flags[1:-1].strip()
 
-        # llama.cpp flags that only have a long form (--) despite being short
-        long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
+        if extra_flags.startswith('-'):
+            # New literal format: "--jinja --rpc 192.168.1.100:50052"
+            cmd += shlex.split(extra_flags)
+        else:
+            # Legacy format: "flag1=value1,flag2,flag3=value3"
+            long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
 
-        for flag_item in extra_flags.split(','):
-            flag_item = flag_item.strip()
-            if '=' in flag_item:
-                flag, value = flag_item.split('=', 1)
-                flag = flag.strip()
-                value = value.strip()
-                if len(flag) <= 3 and flag not in long_form_only:
-                    cmd += [f"-{flag}", value]
+            for flag_item in extra_flags.split(','):
+                flag_item = flag_item.strip()
+                if '=' in flag_item:
+                    flag, value = flag_item.split('=', 1)
+                    flag = flag.strip()
+                    value = value.strip()
+                    if len(flag) <= 3 and flag not in long_form_only:
+                        cmd += [f"-{flag}", value]
+                    else:
+                        cmd += [f"--{flag}", value]
                 else:
-                    cmd += [f"--{flag}", value]
-            else:
-                if len(flag_item) <= 3 and flag_item not in long_form_only:
-                    cmd.append(f"-{flag_item}")
-                else:
-                    cmd.append(f"--{flag_item}")
+                    if len(flag_item) <= 3 and flag_item not in long_form_only:
+                        cmd.append(f"-{flag_item}")
+                    else:
+                        cmd.append(f"--{flag_item}")
 
         env = os.environ.copy()
         if os.name == 'posix':
diff --git a/modules/shared.py b/modules/shared.py
index 486f376f..2382e714 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -109,7 +109,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
-group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
+group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index cb2052a4..6d8baff1 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -98,7 +98,7 @@ def create_ui():
         shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
         shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
         shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
-        shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
+        shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Extra flags to pass to llama-server. Example: --jinja --rpc 192.168.1.100:50052', value=shared.args.extra_flags)
         shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
         shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
        shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')