mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-04-05 14:45:28 +00:00
API: Add parallel request support for llama.cpp and ExLlamaV3
This commit is contained in:
parent
2f08dce7b0
commit
9824c82cb6
10 changed files with 198 additions and 63 deletions
|
|
@@ -217,8 +217,9 @@ class LlamaServer:
|
|||
full_text = ""
|
||||
|
||||
# Process the streaming response
|
||||
+ stop_event = state.get('stop_event')
|
||||
for line in response.iter_lines():
|
||||
- if shared.stop_everything:
|
||||
+ if shared.stop_everything or (stop_event and stop_event.is_set()):
|
||||
break
|
||||
|
||||
if not line:
|
||||
|
|
@@ -410,6 +411,7 @@ class LlamaServer:
|
|||
cmd += ["--spec-ngram-size-n", str(shared.args.spec_ngram_size_n)]
|
||||
cmd += ["--spec-ngram-size-m", str(shared.args.spec_ngram_size_m)]
|
||||
cmd += ["--spec-ngram-min-hits", str(shared.args.spec_ngram_min_hits)]
|
||||
+ cmd += ["--parallel", str(shared.args.parallel)]
|
||||
if shared.args.streaming_llm:
|
||||
cmd += ["--cache-reuse", "1"]
|
||||
cmd += ["--swa-full"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue