From b9da5c7e3a34ef1a5ae2e372db7ff2c5299d523a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 19 Apr 2025 17:36:04 -0700
Subject: [PATCH] Use 127.0.0.1 instead of localhost for faster llama.cpp on
 Windows

---
 modules/llama_cpp_server.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 9c97e00b..ebce987a 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -37,7 +37,7 @@ class LlamaServer:
         if self.bos_token and text.startswith(self.bos_token):
             add_bos_token = False
 
-        url = f"http://localhost:{self.port}/tokenize"
+        url = f"http://127.0.0.1:{self.port}/tokenize"
         payload = {
             "content": text,
             "add_special": add_bos_token,
@@ -48,7 +48,7 @@ class LlamaServer:
         return result.get("tokens", [])
 
     def decode(self, token_ids, **kwargs):
-        url = f"http://localhost:{self.port}/detokenize"
+        url = f"http://127.0.0.1:{self.port}/detokenize"
         payload = {
             "tokens": token_ids,
         }
@@ -121,7 +121,7 @@ class LlamaServer:
         return payload
 
     def generate_with_streaming(self, prompt, state):
-        url = f"http://localhost:{self.port}/completion"
+        url = f"http://127.0.0.1:{self.port}/completion"
         payload = self.prepare_payload(state)
 
         token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"])
@@ -194,7 +194,7 @@ class LlamaServer:
 
     def get_logits(self, prompt, state, n_probs=128, use_samplers=False):
         """Get the logits/probabilities for the next token after a prompt"""
-        url = f"http://localhost:{self.port}/completion"
+        url = f"http://127.0.0.1:{self.port}/completion"
 
         payload = self.prepare_payload(state)
         payload.update({
@@ -225,7 +225,7 @@ class LlamaServer:
 
     def _get_vocabulary_size(self):
         """Get and store the model's maximum context length."""
-        url = f"http://localhost:{self.port}/v1/models"
+        url = f"http://127.0.0.1:{self.port}/v1/models"
         response = self.session.get(url).json()
 
         if "data" in response and len(response["data"]) > 0:
@@ -235,7 +235,7 @@ class LlamaServer:
 
     def _get_bos_token(self):
         """Get and store the model's BOS token."""
-        url = f"http://localhost:{self.port}/props"
+        url = f"http://127.0.0.1:{self.port}/props"
         response = self.session.get(url).json()
         if "bos_token" in response:
             self.bos_token = response["bos_token"]
@@ -308,7 +308,7 @@ class LlamaServer:
         threading.Thread(target=filter_stderr, args=(self.process.stderr,), daemon=True).start()
 
         # Wait for server to be healthy
-        health_url = f"http://localhost:{self.port}/health"
+        health_url = f"http://127.0.0.1:{self.port}/health"
         start_time = time.time()
         timeout = 3600 * 8  # 8 hours
         while time.time() - start_time < timeout:
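
Note on why this helps on Windows: resolving "localhost" goes through
getaddrinfo, which typically lists the IPv6 loopback ::1 before 127.0.0.1;
when the llama.cpp server only accepts IPv4 connections, each new connection
can stall on the failed IPv6 attempt before falling back to IPv4. The
literal address 127.0.0.1 skips name resolution entirely. The sketch below
is a rough way to observe the difference; it is an illustration only, not
part of the patch, and assumes a llama.cpp server is already listening on
PORT (the value 8080 is a placeholder).

    # measure_loopback.py - hypothetical benchmark, not part of this patch
    import time

    import requests

    PORT = 8080  # placeholder; match the port the server was started with

    for host in ("localhost", "127.0.0.1"):
        url = f"http://{host}:{PORT}/health"
        requests.get(url)  # warm-up request
        start = time.perf_counter()
        for _ in range(20):
            # requests.get() opens a fresh connection on each call, so any
            # name-resolution cost and IPv6 -> IPv4 fallback is paid on
            # every request rather than once per session.
            requests.get(url)
        elapsed = time.perf_counter() - start
        print(f"{host}: {elapsed / 20 * 1000:.1f} ms per request")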