diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index ebce987a..02a56b3c 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -27,8 +27,6 @@ class LlamaServer:
         self.session = requests.Session()
         self.vocabulary_size = None
         self.bos_token = ""
-        self.last_input_length = 0
-        self.last_output_length = 0
 
         # Start the server
         self._start_server()
@@ -142,9 +140,6 @@ class LlamaServer:
             pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
             print()
 
-        self.last_input_length = len(token_ids)
-        self.last_output_length = 0
-
         # Make a direct request with streaming enabled using a context manager
         with self.session.post(url, json=payload, stream=True) as response:
             response.raise_for_status()  # Raise an exception for HTTP errors
@@ -172,7 +167,6 @@ class LlamaServer:
                     # Extract the token content
                     if data.get('content', ''):
                         full_text += data['content']
-                        self.last_output_length += 1
                         yield full_text
 
                     # Check if generation is complete
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 675eb379..70f03443 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -481,12 +481,8 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str
         traceback.print_exc()
     finally:
         t1 = time.time()
-        if shared.args.loader == 'llama.cpp':
-            original_tokens = shared.model.last_input_length
-            new_tokens = shared.model.last_output_length
-        else:
-            original_tokens = len(encode(original_question)[0])
-            new_tokens = len(encode(original_question + reply)[0]) - original_tokens
+        original_tokens = len(encode(original_question)[0])
+        new_tokens = len(encode(original_question + reply)[0]) - original_tokens
         print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
 
     return