diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2ae01ddc..9b9756a9 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -470,6 +470,10 @@ class LlamaServer:
             else:
                 cmd.append(f"--{flag_item}")
 
+        # Patch flags for ik_llama.cpp compatibility
+        if shared.args.ik:
+            cmd = _patch_cmd_for_ik(cmd)
+
         env = os.environ.copy()
         if os.name == 'posix':
             current_path = env.get('LD_LIBRARY_PATH', '')
@@ -607,3 +611,36 @@ def filter_stderr_with_progress(process_stderr):
             process_stderr.close()
         except Exception:
             pass
+
+
+def _patch_cmd_for_ik(cmd):
+    """
+    Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents:
+      --no-webui → --webui none
+      --fit on → --fit (bare flag); --fit off → (removed)
+      --fit-ctx → (removed, along with its value)
+      --fit-target → --fit-margin
+    """
+    patched = []
+    i = 0
+    while i < len(cmd):
+        arg = cmd[i]
+
+        if arg == "--no-webui":
+            patched += ["--webui", "none"]
+        elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"):
+            val = cmd[i + 1]
+            i += 1
+            if val == "on":
+                patched.append("--fit")
+            # "off" → drop entirely
+        elif arg == "--fit-ctx":
+            i += 1  # skip the value
+        elif arg == "--fit-target":
+            patched.append("--fit-margin")
+        else:
+            patched.append(arg)
+
+        i += 1
+
+    return patched
diff --git a/modules/shared.py b/modules/shared.py
index acb103b4..c50736d7 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,6 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside /lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')