Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2026-01-16 21:51:21 +01:00)
Use 127.0.0.1 instead of localhost for faster llama.cpp on Windows
parent 9c9df2063f
commit b9da5c7e3a
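The change is purely about name resolution: on Windows, "localhost" can be looked up on every request and often resolves to the IPv6 address ::1 first, so a client falls back to 127.0.0.1 only after that attempt fails against a server listening on IPv4, while the literal address skips the lookup entirely. Below is a minimal sketch for measuring the per-request difference. It is not part of the commit; it assumes a llama.cpp server is already running on the hypothetical PORT and that the requests package is installed.

# Sketch (not from the commit): compare request latency via hostname vs literal IP.
# Assumes a llama.cpp server is already listening on 127.0.0.1:PORT.
import time

import requests

PORT = 8080  # hypothetical port; use the port your server was started with


def average_latency(host, n=20):
    """Average time for n GET requests to the server's /health endpoint."""
    url = f"http://{host}:{PORT}/health"
    start = time.time()
    for _ in range(n):
        requests.get(url)
    return (time.time() - start) / n


print("localhost :", average_latency("localhost"))
print("127.0.0.1 :", average_latency("127.0.0.1"))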
@@ -37,7 +37,7 @@ class LlamaServer:
         if self.bos_token and text.startswith(self.bos_token):
             add_bos_token = False
 
-        url = f"http://localhost:{self.port}/tokenize"
+        url = f"http://127.0.0.1:{self.port}/tokenize"
         payload = {
             "content": text,
             "add_special": add_bos_token,
@@ -48,7 +48,7 @@ class LlamaServer:
         return result.get("tokens", [])
 
     def decode(self, token_ids, **kwargs):
-        url = f"http://localhost:{self.port}/detokenize"
+        url = f"http://127.0.0.1:{self.port}/detokenize"
         payload = {
             "tokens": token_ids,
         }
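Both hunks above talk to the llama.cpp server's HTTP API. Outside the class, the tokenize/detokenize round trip looks roughly like the following sketch (not part of the commit; it assumes the server runs on a hypothetical 127.0.0.1:PORT, that /tokenize answers with a "tokens" list as the encode() hunk implies, and that /detokenize answers with a "content" string).

# Sketch (not part of the commit) of the tokenize/detokenize round trip.
import requests

PORT = 8080  # hypothetical port


def tokenize(text, add_special=True):
    # Same payload shape as the encode() hunk above.
    r = requests.post(f"http://127.0.0.1:{PORT}/tokenize",
                      json={"content": text, "add_special": add_special})
    return r.json().get("tokens", [])


def detokenize(token_ids):
    # Mirrors the decode() hunk; the "content" field of the reply is an assumption.
    r = requests.post(f"http://127.0.0.1:{PORT}/detokenize",
                      json={"tokens": token_ids})
    return r.json().get("content", "")


ids = tokenize("Hello, world!")
print(ids)
print(detokenize(ids))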
@@ -121,7 +121,7 @@ class LlamaServer:
         return payload
 
     def generate_with_streaming(self, prompt, state):
-        url = f"http://localhost:{self.port}/completion"
+        url = f"http://127.0.0.1:{self.port}/completion"
         payload = self.prepare_payload(state)
 
         token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"])
@@ -194,7 +194,7 @@ class LlamaServer:
 
     def get_logits(self, prompt, state, n_probs=128, use_samplers=False):
         """Get the logits/probabilities for the next token after a prompt"""
-        url = f"http://localhost:{self.port}/completion"
+        url = f"http://127.0.0.1:{self.port}/completion"
 
         payload = self.prepare_payload(state)
         payload.update({
@@ -225,7 +225,7 @@ class LlamaServer:
 
     def _get_vocabulary_size(self):
         """Get and store the model's maximum context length."""
-        url = f"http://localhost:{self.port}/v1/models"
+        url = f"http://127.0.0.1:{self.port}/v1/models"
         response = self.session.get(url).json()
 
         if "data" in response and len(response["data"]) > 0:
@@ -235,7 +235,7 @@ class LlamaServer:
 
     def _get_bos_token(self):
         """Get and store the model's BOS token."""
-        url = f"http://localhost:{self.port}/props"
+        url = f"http://127.0.0.1:{self.port}/props"
         response = self.session.get(url).json()
         if "bos_token" in response:
             self.bos_token = response["bos_token"]
@@ -308,7 +308,7 @@ class LlamaServer:
         threading.Thread(target=filter_stderr, args=(self.process.stderr,), daemon=True).start()
 
         # Wait for server to be healthy
-        health_url = f"http://localhost:{self.port}/health"
+        health_url = f"http://127.0.0.1:{self.port}/health"
         start_time = time.time()
         timeout = 3600 * 8  # 8 hours
         while time.time() - start_time < timeout:
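The last hunk changes the startup health check. Stripped of the class context, the waiting pattern it belongs to is roughly the following sketch (not the commit's code; the real method lives on LlamaServer and uses self.session and self.port, and the sketch assumes /health returns HTTP 200 once the model is loaded).

# Sketch of the startup wait loop from the last hunk (simplified).
import time

import requests


def wait_until_healthy(port, timeout=3600 * 8, interval=1.0):
    """Poll /health until the server answers 200 or the timeout expires."""
    health_url = f"http://127.0.0.1:{port}/health"
    start_time = time.time()
    while time.time() - start_time < timeout:
        try:
            if requests.get(health_url).status_code == 200:
                return True
        except requests.ConnectionError:
            pass  # server process not accepting connections yet
        time.sleep(interval)
    return False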