From a289a92b9408e2542632ffa600ef57c373200aec Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 9 Aug 2025 17:10:58 -0700
Subject: [PATCH] Fix exllamav3 token count

---
 modules/exllamav3.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index e3a2d95a..268a64ec 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -267,9 +267,11 @@ class Exllamav3Model:
 
         input_ids = input_ids[:, -get_max_prompt_length(state):]
 
+        self._last_prompt_token_count = input_ids.shape[-1]
+
         # Determine max_new_tokens
         if state['auto_max_new_tokens']:
-            max_new_tokens = state['truncation_length'] - input_ids.shape[-1]
+            max_new_tokens = state['truncation_length'] - self._last_prompt_token_count
         else:
             max_new_tokens = state['max_new_tokens']
 
@@ -323,8 +325,7 @@ class Exllamav3Model:
 
     @property
     def last_prompt_token_count(self):
-        # This would need to be tracked during generation
-        return 0
+        return getattr(self, '_last_prompt_token_count', 0)
 
     def unload(self):
         logger.info("Unloading ExLlamaV3 model components...")