llama.cpp: Support literal flags in --extra-flags (e.g. --rpc, --jinja)

The old format is still accepted for backwards compatibility.
This commit is contained in:
oobabooga 2026-03-17 19:47:55 -07:00
parent 2a6b1fdcba
commit 7e54e7b7ae
3 changed files with 23 additions and 18 deletions

View file

@@ -1,6 +1,7 @@
import json
import os
import pprint
import shlex
import re
import socket
import subprocess
@@ -446,24 +447,28 @@ class LlamaServer:
elif extra_flags.startswith("'") and extra_flags.endswith("'"):
extra_flags = extra_flags[1:-1].strip()
# llama.cpp flags that only have a long form (--) despite being short
long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
if extra_flags.startswith('-'):
# New literal format: "--jinja --rpc 1222,1222"
cmd += shlex.split(extra_flags)
else:
# Legacy format: "flag1=value1,flag2,flag3=value3"
long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
for flag_item in extra_flags.split(','):
flag_item = flag_item.strip()
if '=' in flag_item:
flag, value = flag_item.split('=', 1)
flag = flag.strip()
value = value.strip()
if len(flag) <= 3 and flag not in long_form_only:
cmd += [f"-{flag}", value]
for flag_item in extra_flags.split(','):
flag_item = flag_item.strip()
if '=' in flag_item:
flag, value = flag_item.split('=', 1)
flag = flag.strip()
value = value.strip()
if len(flag) <= 3 and flag not in long_form_only:
cmd += [f"-{flag}", value]
else:
cmd += [f"--{flag}", value]
else:
cmd += [f"--{flag}", value]
else:
if len(flag_item) <= 3 and flag_item not in long_form_only:
cmd.append(f"-{flag_item}")
else:
cmd.append(f"--{flag_item}")
if len(flag_item) <= 3 and flag_item not in long_form_only:
cmd.append(f"-{flag_item}")
else:
cmd.append(f"--{flag_item}")
env = os.environ.copy()
if os.name == 'posix':

View file

@@ -109,7 +109,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')

View file

@@ -98,7 +98,7 @@ def create_ui():
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Extra flags to pass to llama-server. Example: --jinja --rpc 192.168.1.100:50052', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')