Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2026-04-04 14:17:28 +00:00)
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669)
parent 9271e80914
commit afb51bd5d6
7 changed files with 147 additions and 0 deletions
@@ -13,6 +13,7 @@ import transformers
 from transformers import LogitsProcessorList, is_torch_xpu_available
 
 import modules.shared as shared
+from modules.cache_utils import process_llamacpp_cache
 from modules.callbacks import (
     Iteratorize,
     Stream,
@@ -364,6 +365,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
         print(decode(input_ids[0], skip_special_tokens=False))
         print()
 
+    # Handle StreamingLLM for llamacpp_HF
+    if shared.model.__class__.__name__ == 'LlamacppHF' and shared.args.streaming_llm:
+        tmp = process_llamacpp_cache(shared.model.model, input_ids[-1].tolist(), shared.model.model._input_ids)
+        shared.model.past_seq = torch.tensor(tmp)
+        shared.model.save_cache()
+
     t0 = time.time()
     try:
         if not is_chat and not shared.is_seq2seq:
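For readers following the diff, below is a minimal, self-contained sketch of the kind of prefix matching that StreamingLLM cache reuse implies. The real logic lives in modules/cache_utils.process_llamacpp_cache (added by this commit but not shown in this excerpt), which also shifts and evicts entries in the llama.cpp KV cache itself; the function names and the sink_size parameter below are illustrative assumptions, not the module's actual API.

# Illustrative sketch only; names (longest_common_prefix, reusable_positions,
# sink_size) are assumptions, not the modules/cache_utils API.

def longest_common_prefix(a: list[int], b: list[int]) -> int:
    """Count how many leading token ids two sequences share."""
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n


def reusable_positions(past_seq: list[int], new_seq: list[int], sink_size: int = 4) -> int:
    """Return how many cached positions can be kept for the new prompt.

    StreamingLLM always retains the first `sink_size` tokens (the
    "attention sinks"), even when the prompt diverges earlier, because it
    evicts from the middle of the cache rather than the front. Beyond the
    sinks, only the prefix of the old cache that still matches the new
    prompt is reusable, so just the divergent suffix of `new_seq` has to
    be re-evaluated.
    """
    matched = longest_common_prefix(past_seq, new_seq)
    keep = max(matched, min(sink_size, len(past_seq)))
    return min(keep, len(new_seq))


if __name__ == "__main__":
    cached = [1, 42, 7, 7, 100, 101, 102]   # tokens already in the KV cache
    prompt = [1, 42, 7, 7, 100, 555, 556]   # new prompt diverges at index 5
    print(reusable_positions(cached, prompt))  # -> 5: only 2 tokens need evaluation

In the hunk above, the token list returned by process_llamacpp_cache is stored back as shared.model.past_seq so the next generation can diff against it, and save_cache() persists the updated llama.cpp state, consistent with this bookkeeping.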