From e2e90af6cd5e839830cf1a75218777ea9c01bc53 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 18 Apr 2025 20:51:18 -0700
Subject: [PATCH 1/3] llama.cpp: don't include --rope-freq-base in the launch command if null

---
 modules/llama_cpp_server.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 5986ac36..5071c40c 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -250,7 +250,6 @@ class LlamaServer:
             "--ctx-size", str(shared.args.n_ctx),
             "--n-gpu-layers", str(shared.args.n_gpu_layers),
             "--batch-size", str(shared.args.batch_size),
-            "--rope-freq-base", str(shared.args.rope_freq_base),
             "--port", str(self.port),
         ]
 
@@ -276,6 +275,8 @@ class LlamaServer:
             cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
         if shared.args.compress_pos_emb != 1:
             cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
+        if shared.args.rope_freq_base > 0:
+            cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
 
         # Start the server with pipes for output
         self.process = subprocess.Popen(

From e2e73ed22f348092bb8915c29d2e2d68c6c8ec94 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 18 Apr 2025 21:04:56 -0700
Subject: [PATCH 2/3] llama.cpp: new optimization attempt

---
 modules/llama_cpp_server.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 5071c40c..b0fe1154 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -10,6 +10,7 @@ import llama_cpp_binaries
 import requests
 
 from modules import shared
+from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 
 llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"}
@@ -118,7 +119,7 @@ class LlamaServer:
 
         return payload
 
-    def generate_with_streaming(self, prompt, state):
+    def generate(self, prompt, state, callback=None):
         url = f"http://localhost:{self.port}/completion"
         payload = self.prepare_payload(state)
 
@@ -144,7 +145,7 @@ class LlamaServer:
         with self.session.post(url, json=payload, stream=True) as response:
             response.raise_for_status()  # Raise an exception for HTTP errors
 
-            full_text = ""
+            output = ""
 
             # Process the streaming response
             for line in response.iter_lines(decode_unicode=True):
@@ -162,9 +163,10 @@ class LlamaServer:
 
                     # Extract the token content
                     if 'content' in data:
-                        token_text = data['content']
-                        full_text += token_text
-                        yield full_text
+                        text = data['content']
+                        output += text
+                        if callback:
+                            callback(output)
 
                     # Check if generation is complete
                     if data.get('stop', False):
@@ -176,13 +178,13 @@ class LlamaServer:
                     print(f"Problematic line: {line}")
                     continue
 
-    def generate(self, prompt, state):
-        output = ""
-        for output in self.generate_with_streaming(prompt, state):
-            pass
-
         return output
 
+    def generate_with_streaming(self, *args, **kwargs):
+        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
+            for output in generator:
+                yield output
+
     def get_logits(self, prompt, state, n_probs=128, use_samplers=False):
         """Get the logits/probabilities for the next token after a prompt"""
         url = f"http://localhost:{self.port}/completion"

From 6589ebeca8012a991ef6a39c90fbd5e588f0c6dd Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 18 Apr 2025 21:16:21 -0700
Subject: [PATCH 3/3] Revert
"llama.cpp: new optimization attempt" This reverts commit e2e73ed22f348092bb8915c29d2e2d68c6c8ec94. --- modules/llama_cpp_server.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index b0fe1154..5071c40c 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -10,7 +10,6 @@ import llama_cpp_binaries import requests from modules import shared -from modules.callbacks import Iteratorize from modules.logging_colors import logger llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"} @@ -119,7 +118,7 @@ class LlamaServer: return payload - def generate(self, prompt, state, callback=None): + def generate_with_streaming(self, prompt, state): url = f"http://localhost:{self.port}/completion" payload = self.prepare_payload(state) @@ -145,7 +144,7 @@ class LlamaServer: with self.session.post(url, json=payload, stream=True) as response: response.raise_for_status() # Raise an exception for HTTP errors - output = "" + full_text = "" # Process the streaming response for line in response.iter_lines(decode_unicode=True): @@ -163,10 +162,9 @@ class LlamaServer: # Extract the token content if 'content' in data: - text = data['content'] - output += text - if callback: - callback(output) + token_text = data['content'] + full_text += token_text + yield full_text # Check if generation is complete if data.get('stop', False): @@ -178,12 +176,12 @@ class LlamaServer: print(f"Problematic line: {line}") continue - return output + def generate(self, prompt, state): + output = "" + for output in self.generate_with_streaming(prompt, state): + pass - def generate_with_streaming(self, *args, **kwargs): - with Iteratorize(self.generate, args, kwargs, callback=None) as generator: - for output in generator: - yield output + return output def get_logits(self, prompt, state, n_probs=128, use_samplers=False): """Get the logits/probabilities for the next token after a prompt"""