ik_llama.cpp: Auto-enable Hadamard KV cache rotation with quantized cache

2026-04-06 15:13:38 +00:00 · 2026-03-29 15:52:36 -07:00 · 2026-03-29 15:52:36 -07:00 · 0466b6e271
commit 0466b6e271
parent be6fc0663a
1 changed file with 6 additions and 0 deletions
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -631,6 +631,12 @@ def _patch_cmd_for_ik(cmd):
      --cache-reuse        → (removed, unsupported)
      --swa-full           → (removed, unsupported)
    """
+    # Add Hadamard KV cache rotation when using quantized cache types.
+    # This significantly improves quantized cache quality (especially q4_0)
+    # and is a no-op for MLA models like DeepSeek.
+    if shared.args.cache_type in ("q8_0", "q4_0"):
+        cmd += ["-khad", "-vhad"]
+
    patched = []
    i = 0
    while i < len(cmd):