Add adaptive-p sampler and n-gram speculative decoding support

This commit is contained in:
oobabooga 2026-03-04 09:41:29 -08:00
parent f010aa1612
commit 65de4c30c8
10 changed files with 145 additions and 3 deletions

View file

@@ -75,6 +75,8 @@ class LlamaServer:
"top_p": state["top_p"],
"min_p": state["min_p"],
"top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1,
"adaptive_target": state["adaptive_target"] if state["adaptive_target"] > 0 else -1,
"adaptive_decay": state["adaptive_decay"],
"typical_p": state["typical_p"],
"repeat_penalty": state["repetition_penalty"],
"repeat_last_n": state["repetition_penalty_range"],
@@ -123,6 +125,12 @@ class LlamaServer:
filtered_samplers.remove("temperature")
filtered_samplers.append("temperature")
# adaptive-p replaces the default dist sampler; llama.cpp always
# places it at the end of the chain regardless of position, so we
# activate it based on the parameter value rather than sampler order.
if state.get("adaptive_target", 0) > 0:
filtered_samplers.append("adaptive-p")
payload["samplers"] = filtered_samplers
if state['custom_token_bans']:
@@ -391,6 +399,16 @@ class LlamaServer:
cmd += ["--device-draft", shared.args.device_draft]
if shared.args.ctx_size_draft > 0:
cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
if shared.args.spec_type != 'none':
cmd += ["--spec-type", shared.args.spec_type]
if shared.args.draft_max > 0:
cmd += ["--draft-max", str(shared.args.draft_max)]
if shared.args.spec_ngram_size_n != 12:
cmd += ["--spec-ngram-size-n", str(shared.args.spec_ngram_size_n)]
if shared.args.spec_ngram_size_m != 48:
cmd += ["--spec-ngram-size-m", str(shared.args.spec_ngram_size_m)]
if shared.args.spec_ngram_min_hits != 1:
cmd += ["--spec-ngram-min-hits", str(shared.args.spec_ngram_min_hits)]
if shared.args.streaming_llm:
cmd += ["--cache-reuse", "1"]
cmd += ["--swa-full"]