Revert "Attempt at making the llama-server streaming more efficient."

This reverts commit 5ad080ff25.
This commit is contained in:
oobabooga 2025-04-18 18:13:54 -07:00
parent 71ae05e0a4
commit 2002590536
10 changed files with 26 additions and 37 deletions

View file

@@ -8,7 +8,6 @@ import time
import llama_cpp_binaries
import requests
import sseclient
from modules import shared
from modules.logging_colors import logger
@@ -140,43 +139,42 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
# Configure headers for Server-Sent Events
headers = {
'Content-Type': 'application/json',
'Accept': 'text/event-stream'
}
response = requests.post(url, json=payload, stream=True, headers=headers)
response.raise_for_status()
# Initialize SSE client for proper event stream parsing
client = sseclient.SSEClient(response)
# Make a direct request with streaming enabled
response = requests.post(url, json=payload, stream=True)
response.raise_for_status() # Raise an exception for HTTP errors
full_text = ""
for event in client.events():
# Process the streaming response
for line in response.iter_lines():
if shared.stop_everything:
break
try:
# Handle stream termination marker
if event.data == '[DONE]':
break
if line:
try:
# Check if the line starts with "data: " and remove it
line_str = line.decode('utf-8')
if line_str.startswith('data: '):
line_str = line_str[6:] # Remove the "data: " prefix
data = json.loads(event.data)
# Parse the JSON data
data = json.loads(line_str)
if 'content' in data:
token_text = data['content']
full_text += token_text
yield full_text
# Extract the token content
if 'content' in data:
token_text = data['content']
full_text += token_text
yield full_text
if data.get('stop', False):
break
# Check if generation is complete
if data.get('stop', False):
break
except json.JSONDecodeError as e:
print(f"JSON decode error: {e}")
print(f"Problematic data: {event.data}")
continue
except json.JSONDecodeError as e:
# Log the error and the problematic line
print(f"JSON decode error: {e}")
print(f"Problematic line: {line}")
continue
def generate(self, prompt, state):
output = ""

View file

@@ -19,7 +19,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -19,7 +19,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*