mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-25 06:44:39 +01:00
llama.cpp: Support literal flags in --extra-flags (e.g. --rpc, --jinja)
The old comma-separated format ("flag1=value1,flag2,flag3=value3") is still accepted for backwards compatibility.
This commit is contained in:
parent
2a6b1fdcba
commit
7e54e7b7ae
|
|
@ -1,6 +1,7 @@
|
|||
import json
|
||||
import os
|
||||
import pprint
|
||||
import shlex
|
||||
import re
|
||||
import socket
|
||||
import subprocess
|
||||
|
|
@ -446,24 +447,28 @@ class LlamaServer:
|
|||
elif extra_flags.startswith("'") and extra_flags.endswith("'"):
|
||||
extra_flags = extra_flags[1:-1].strip()
|
||||
|
||||
# llama.cpp flags that only have a long form (--) despite being short
|
||||
long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
|
||||
if extra_flags.startswith('-'):
|
||||
# New literal format: "--jinja --rpc 1222,1222"
|
||||
cmd += shlex.split(extra_flags)
|
||||
else:
|
||||
# Legacy format: "flag1=value1,flag2,flag3=value3"
|
||||
long_form_only = {'rpc', 'fit', 'pos', 'ppl'}
|
||||
|
||||
for flag_item in extra_flags.split(','):
|
||||
flag_item = flag_item.strip()
|
||||
if '=' in flag_item:
|
||||
flag, value = flag_item.split('=', 1)
|
||||
flag = flag.strip()
|
||||
value = value.strip()
|
||||
if len(flag) <= 3 and flag not in long_form_only:
|
||||
cmd += [f"-{flag}", value]
|
||||
for flag_item in extra_flags.split(','):
|
||||
flag_item = flag_item.strip()
|
||||
if '=' in flag_item:
|
||||
flag, value = flag_item.split('=', 1)
|
||||
flag = flag.strip()
|
||||
value = value.strip()
|
||||
if len(flag) <= 3 and flag not in long_form_only:
|
||||
cmd += [f"-{flag}", value]
|
||||
else:
|
||||
cmd += [f"--{flag}", value]
|
||||
else:
|
||||
cmd += [f"--{flag}", value]
|
||||
else:
|
||||
if len(flag_item) <= 3 and flag_item not in long_form_only:
|
||||
cmd.append(f"-{flag_item}")
|
||||
else:
|
||||
cmd.append(f"--{flag_item}")
|
||||
if len(flag_item) <= 3 and flag_item not in long_form_only:
|
||||
cmd.append(f"-{flag_item}")
|
||||
else:
|
||||
cmd.append(f"--{flag_item}")
|
||||
|
||||
env = os.environ.copy()
|
||||
if os.name == 'posix':
|
||||
|
|
|
|||
|
|
@ -109,7 +109,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
|
|||
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
|
||||
group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
|
||||
group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
|
||||
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
|
||||
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
|
||||
|
||||
# Transformers/Accelerate
|
||||
group = parser.add_argument_group('Transformers/Accelerate')
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ def create_ui():
|
|||
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
|
||||
shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
|
||||
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
|
||||
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
|
||||
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Extra flags to pass to llama-server. Example: --jinja --rpc 192.168.1.100:50052', value=shared.args.extra_flags)
|
||||
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
|
||||
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
|
||||
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
|
||||
|
|
|
|||
Loading…
Reference in a new issue