From 8be798e15f48bd1f498d2c609ddf2f31cf22524b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 24 Aug 2025 12:19:19 -0700 Subject: [PATCH] llama.cpp: Fix stderr deadlock while loading some multimodal models --- modules/llama_cpp_server.py | 70 ++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 8f1924cb..e3dd43b4 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -410,8 +410,7 @@ class LlamaServer: self.process = subprocess.Popen( cmd, stderr=subprocess.PIPE, - text=True, - bufsize=1, + bufsize=0, env=env ) @@ -473,34 +472,55 @@ def filter_stderr_with_progress(process_stderr): last_was_progress = False try: - for raw in iter(process_stderr.readline, ''): - line = raw.rstrip('\r\n') - match = progress_re.search(line) + # Read in binary mode and decode manually + buffer = b"" + while True: + # Read chunks aggressively to prevent buffer overflow + chunk = process_stderr.read(4096) + if not chunk: + break - if match: - progress = float(match.group(1)) + buffer += chunk - # Extract just the part from "prompt processing" onwards - prompt_processing_idx = line.find('prompt processing') - if prompt_processing_idx != -1: - display_line = line[prompt_processing_idx:] - else: - display_line = line # fallback to full line + # Process complete lines + while b'\n' in buffer: + line_bytes, buffer = buffer.split(b'\n', 1) + try: + line = line_bytes.decode('utf-8', errors='replace').strip('\r\n') + if line: # Process non-empty lines + match = progress_re.search(line) - # choose carriage return for in-progress or newline at completion - end_char = '\r' if progress < 1.0 else '\n' - print(display_line, end=end_char, file=sys.stderr, flush=True) - last_was_progress = (progress < 1.0) + if match: + progress = float(match.group(1)) - # skip noise lines - elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line): - # if we were in progress, finish that line first - if last_was_progress: - print(file=sys.stderr) + # Extract just the part from "prompt processing" onwards + prompt_processing_idx = line.find('prompt processing') + if prompt_processing_idx != -1: + display_line = line[prompt_processing_idx:] + else: + display_line = line # fallback to full line - print(line, file=sys.stderr, flush=True) - last_was_progress = False + # choose carriage return for in-progress or newline at completion + end_char = '\r' if progress < 1.0 else '\n' + print(display_line, end=end_char, file=sys.stderr, flush=True) + last_was_progress = (progress < 1.0) + + # skip noise lines + elif not (line.startswith(('srv ', 'slot ')) or 'log_server_r: request: GET /health' in line): + # if we were in progress, finish that line first + if last_was_progress: + print(file=sys.stderr) + + print(line, file=sys.stderr, flush=True) + last_was_progress = False + + except Exception: + continue except (ValueError, IOError): - # silently ignore broken output or IO errors pass + finally: + try: + process_stderr.close() + except: + pass