Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669)

2026-04-04 14:17:28 +00:00 · 2024-03-09 00:25:33 -03:00 · 2024-03-09 00:25:33 -03:00 · afb51bd5d6
commit afb51bd5d6
parent 9271e80914
7 changed files with 147 additions and 0 deletions
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -13,6 +13,7 @@ import transformers
 from transformers import LogitsProcessorList, is_torch_xpu_available

 import modules.shared as shared
+from modules.cache_utils import process_llamacpp_cache
 from modules.callbacks import (
    Iteratorize,
    Stream,
@@ -364,6 +365,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
        print(decode(input_ids[0], skip_special_tokens=False))
        print()

+    # Handle StreamingLLM for llamacpp_HF
+    if shared.model.__class__.__name__ == 'LlamacppHF' and shared.args.streaming_llm:
+        tmp = process_llamacpp_cache(shared.model.model, input_ids[-1].tolist(), shared.model.model._input_ids)
+        shared.model.past_seq = torch.tensor(tmp)
+        shared.model.save_cache()
+
    t0 = time.time()
    try:
        if not is_chat and not shared.is_seq2seq: