From 0466b6e2714a05c04eff0c929f15e4679f029e8d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 29 Mar 2026 15:52:36 -0700 Subject: [PATCH] ik_llama.cpp: Auto-enable Hadamard KV cache rotation with quantized cache --- modules/llama_cpp_server.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 5e2decfa..fa968be1 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -631,6 +631,12 @@ def _patch_cmd_for_ik(cmd): --cache-reuse → (removed, unsupported) --swa-full → (removed, unsupported) """ + # Add Hadamard KV cache rotation when using quantized cache types. + # This significantly improves quantized cache quality (especially q4_0) + # and is a no-op for MLA models like DeepSeek. + if shared.args.cache_type in ("q8_0", "q4_0"): + cmd += ["-khad", "-vhad"] + patched = [] i = 0 while i < len(cmd):