llama.cpp: add back the two encode calls (they are harmless now)

oobabooga 2025-04-19 17:38:36 -07:00
parent b9da5c7e3a
commit 5ab069786b
2 changed files with 2 additions and 12 deletions


@@ -27,8 +27,6 @@ class LlamaServer:
         self.session = requests.Session()
         self.vocabulary_size = None
         self.bos_token = "<s>"
-        self.last_input_length = 0
-        self.last_output_length = 0
 
         # Start the server
         self._start_server()
@@ -142,9 +140,6 @@ class LlamaServer:
             pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
             print()
 
-        self.last_input_length = len(token_ids)
-        self.last_output_length = 0
-
         # Make a direct request with streaming enabled using a context manager
         with self.session.post(url, json=payload, stream=True) as response:
             response.raise_for_status()  # Raise an exception for HTTP errors
@@ -172,7 +167,6 @@ class LlamaServer:
                         # Extract the token content
                         if data.get('content', ''):
                             full_text += data['content']
-                            self.last_output_length += 1
                             yield full_text
 
                         # Check if generation is complete

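As an aside on the streaming loop touched above: with the output counter gone, accumulating the reply is just string concatenation over the streamed chunks. A minimal, self-contained sketch under assumptions (the helper name accumulate_stream, the "data: {...}" line format, and the "stop" field are illustrative, not taken from this commit):

import json


def accumulate_stream(lines):
    # Hypothetical helper, not part of this commit: fold streamed chunks into
    # a growing reply, mirroring the loop shown in the hunk above.
    full_text = ""
    for raw in lines:
        line = raw.decode("utf-8") if isinstance(raw, bytes) else raw
        if not line.startswith("data: "):
            continue  # skip keep-alive or non-data lines
        data = json.loads(line[len("data: "):])
        if data.get("content", ""):
            full_text += data["content"]
            yield full_text  # no separate output-length counter is needed
        if data.get("stop", False):
            break  # generation is complete

For example, accumulate_stream([b'data: {"content": "Hi", "stop": true}']) yields a single value, "Hi".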

@@ -481,12 +481,8 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str
         traceback.print_exc()
 
     finally:
         t1 = time.time()
-        if shared.args.loader == 'llama.cpp':
-            original_tokens = shared.model.last_input_length
-            new_tokens = shared.model.last_output_length
-        else:
-            original_tokens = len(encode(original_question)[0])
-            new_tokens = len(encode(original_question + reply)[0]) - original_tokens
+        original_tokens = len(encode(original_question)[0])
+        new_tokens = len(encode(original_question + reply)[0]) - original_tokens
         print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
         return
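For illustration of the restored accounting: the two encode calls measure the prompt and the prompt-plus-reply, and their difference feeds the tokens/s figure in the final print. A minimal sketch under assumptions (the function name, the zero-interval guard, and the premise that encode() returns a batch whose first element is the token-id list, as the [0] indexing suggests, are illustrative, not the project's code):

import time


def report_generation_stats(encode, original_question, reply, t0, seed):
    # Illustrative sketch of the restored code path, not the project's function.
    t1 = time.time()
    original_tokens = len(encode(original_question)[0])  # prompt length in tokens
    # Encoding prompt + reply and subtracting the prompt length approximates
    # the number of newly generated tokens.
    new_tokens = len(encode(original_question + reply)[0]) - original_tokens
    elapsed = max(t1 - t0, 1e-6)  # avoid a zero interval in this sketch
    print(f'Output generated in {elapsed:.2f} seconds '
          f'({new_tokens / elapsed:.2f} tokens/s, {new_tokens} tokens, '
          f'context {original_tokens}, seed {seed})')

With illustrative numbers, a 128-token prompt and a 640-token prompt-plus-reply encoding over 16 seconds give 512 new tokens at 32.00 tokens/s.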