Revert "Attempt at making the llama-server streaming more efficient."

This reverts commit 5ad080ff25.
This commit is contained in:
oobabooga 2025-04-18 18:13:54 -07:00
parent 71ae05e0a4
commit 2002590536
10 changed files with 26 additions and 37 deletions

View file

@@ -8,7 +8,6 @@ import time
import llama_cpp_binaries
import requests
import sseclient
from modules import shared
from modules.logging_colors import logger
@@ -140,43 +139,42 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
# Configure headers for Server-Sent Events
headers = {
'Content-Type': 'application/json',
'Accept': 'text/event-stream'
}
response = requests.post(url, json=payload, stream=True, headers=headers)
response.raise_for_status()
# Initialize SSE client for proper event stream parsing
client = sseclient.SSEClient(response)
# Make a direct request with streaming enabled
response = requests.post(url, json=payload, stream=True)
response.raise_for_status() # Raise an exception for HTTP errors
full_text = ""
for event in client.events():
# Process the streaming response
for line in response.iter_lines():
if shared.stop_everything:
break
try:
# Handle stream termination marker
if event.data == '[DONE]':
break
if line:
try:
# Check if the line starts with "data: " and remove it
line_str = line.decode('utf-8')
if line_str.startswith('data: '):
line_str = line_str[6:] # Remove the "data: " prefix
data = json.loads(event.data)
# Parse the JSON data
data = json.loads(line_str)
if 'content' in data:
token_text = data['content']
full_text += token_text
yield full_text
# Extract the token content
if 'content' in data:
token_text = data['content']
full_text += token_text
yield full_text
if data.get('stop', False):
break
# Check if generation is complete
if data.get('stop', False):
break
except json.JSONDecodeError as e:
print(f"JSON decode error: {e}")
print(f"Problematic data: {event.data}")
continue
except json.JSONDecodeError as e:
# Log the error and the problematic line
print(f"JSON decode error: {e}")
print(f"Problematic line: {line}")
continue
def generate(self, prompt, state):
output = ""

View file

@@ -19,7 +19,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -19,7 +19,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*

View file

@@ -18,7 +18,6 @@ requests
rich
safetensors==0.5.*
scipy
sseclient-py==1.8.0
sentencepiece
tensorboard
transformers==4.50.*