From 2002590536ad0825aa677366240a649ab871b70f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:13:54 -0700 Subject: [PATCH] Revert "Attempt at making the llama-server streaming more efficient." This reverts commit 5ad080ff25a827c9491e085a45bf63c4b373a75c. --- modules/llama_cpp_server.py | 54 +++++++++++++++----------------- requirements.txt | 1 - requirements_amd.txt | 1 - requirements_amd_noavx2.txt | 1 - requirements_apple_intel.txt | 1 - requirements_apple_silicon.txt | 1 - requirements_cpu_only.txt | 1 - requirements_cpu_only_noavx2.txt | 1 - requirements_noavx2.txt | 1 - requirements_nowheels.txt | 1 - 10 files changed, 26 insertions(+), 37 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index d9b35baa..26ab8f10 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -8,7 +8,6 @@ import time import llama_cpp_binaries import requests -import sseclient from modules import shared from modules.logging_colors import logger @@ -140,43 +139,42 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Configure headers for Server-Sent Events - headers = { - 'Content-Type': 'application/json', - 'Accept': 'text/event-stream' - } - - response = requests.post(url, json=payload, stream=True, headers=headers) - response.raise_for_status() - - # Initialize SSE client for proper event stream parsing - client = sseclient.SSEClient(response) + # Make a direct request with streaming enabled + response = requests.post(url, json=payload, stream=True) + response.raise_for_status() # Raise an exception for HTTP errors full_text = "" - for event in client.events(): + # Process the streaming response + for line in response.iter_lines(): if shared.stop_everything: break - try: - # Handle stream termination marker - if event.data == '[DONE]': - break + if line: + try: + # Check if the line starts with "data: " and remove it + line_str = line.decode('utf-8') + if line_str.startswith('data: '): + line_str = line_str[6:] # Remove the "data: " prefix - data = json.loads(event.data) + # Parse the JSON data + data = json.loads(line_str) - if 'content' in data: - token_text = data['content'] - full_text += token_text - yield full_text + # Extract the token content + if 'content' in data: + token_text = data['content'] + full_text += token_text + yield full_text - if data.get('stop', False): - break + # Check if generation is complete + if data.get('stop', False): + break - except json.JSONDecodeError as e: - print(f"JSON decode error: {e}") - print(f"Problematic data: {event.data}") - continue + except json.JSONDecodeError as e: + # Log the error and the problematic line + print(f"JSON decode error: {e}") + print(f"Problematic line: {line}") + continue def generate(self, prompt, state): output = "" diff --git a/requirements.txt b/requirements.txt index b6759806..607efda0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_amd.txt b/requirements_amd.txt index e156bc55..b242d4ad 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -18,7 +18,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 6becd514..b6105209 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -18,7 +18,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 1223b4d3..ce730f63 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -18,7 +18,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index f47f9991..a7be282d 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -18,7 +18,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 007f9ef1..2437c2ae 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -18,7 +18,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 4219273d..cbaa8e96 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -18,7 +18,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 58e0e5a1..cce27aa2 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -19,7 +19,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index c210f6a0..3b61ca39 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -18,7 +18,6 @@ requests rich safetensors==0.5.* scipy -sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.*