Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669)

This commit is contained in:
oobabooga 2024-03-09 00:25:33 -03:00 committed by GitHub
parent 9271e80914
commit afb51bd5d6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 147 additions and 0 deletions

View file

@@ -13,6 +13,7 @@ import transformers
from transformers import LogitsProcessorList, is_torch_xpu_available
import modules.shared as shared
from modules.cache_utils import process_llamacpp_cache
from modules.callbacks import (
Iteratorize,
Stream,
@@ -364,6 +365,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
print(decode(input_ids[0], skip_special_tokens=False))
print()
# Handle StreamingLLM for llamacpp_HF
if shared.model.__class__.__name__ == 'LlamacppHF' and shared.args.streaming_llm:
tmp = process_llamacpp_cache(shared.model.model, input_ids[-1].tolist(), shared.model.model._input_ids)
shared.model.past_seq = torch.tensor(tmp)
shared.model.save_cache()
t0 = time.time()
try:
if not is_chat and not shared.is_seq2seq: