diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index ebce987a..02a56b3c 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -27,8 +27,6 @@ class LlamaServer:
         self.session = requests.Session()
         self.vocabulary_size = None
         self.bos_token = ""
-        self.last_input_length = 0
-        self.last_output_length = 0
 
         # Start the server
         self._start_server()
@@ -142,9 +140,6 @@ class LlamaServer:
             pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
             print()
 
-        self.last_input_length = len(token_ids)
-        self.last_output_length = 0
-
         # Make a direct request with streaming enabled using a context manager
         with self.session.post(url, json=payload, stream=True) as response:
             response.raise_for_status()  # Raise an exception for HTTP errors
@@ -172,7 +167,6 @@ class LlamaServer:
                     # Extract the token content
                     if data.get('content', ''):
                         full_text += data['content']
-                        self.last_output_length += 1
                         yield full_text
 
                     # Check if generation is complete
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 675eb379..70f03443 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -481,12 +481,8 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str
         traceback.print_exc()
     finally:
         t1 = time.time()
-        if shared.args.loader == 'llama.cpp':
-            original_tokens = shared.model.last_input_length
-            new_tokens = shared.model.last_output_length
-        else:
-            original_tokens = len(encode(original_question)[0])
-            new_tokens = len(encode(original_question + reply)[0]) - original_tokens
+        original_tokens = len(encode(original_question)[0])
+        new_tokens = len(encode(original_question + reply)[0]) - original_tokens
         print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
 
     return