From 5c2f8d828e97c4fd6e16979faa611586077e1072 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 18 Apr 2025 05:42:38 -0700
Subject: [PATCH] Fix exllamav2 generating eos randomly after previous fix

---
 modules/exllamav2_hf.py    | 3 +++
 modules/text_generation.py | 3 ---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index 62d1e054..6486e438 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -157,6 +157,9 @@ class Exllamav2HF(PreTrainedModel):
         else:
             self.past_seq = seq_tensor
 
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
         loss = None
         if labels is not None:
             # Shift so that tokens < n predict n
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 2869e03c..eff6495e 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -263,9 +263,6 @@ def apply_stopping_strings(reply, all_stop_strings):
 
 
 def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-
     reply = decode(output_ids[starting_from:], state['skip_special_tokens'] if state else True)
 
     # Handle tokenizers that do not add the leading space for the first token
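
Note (reviewer comment, not part of the patch): the hunks above move
torch.cuda.synchronize() out of the shared decode path in
get_reply_from_output_ids() and into Exllamav2HF's forward pass, right after
the sequence-cache bookkeeping. The likely rationale: exllamav2's kernels run
asynchronously, and reading the logits before the device has finished writing
them can occasionally yield a garbage token such as the spurious eos named in
the subject line. Scoping the barrier to the exllamav2 loader also spares
every other loader a device-wide synchronize on each decoded token. A minimal,
runnable sketch of the idea follows; run_kernels is a hypothetical stand-in
for the real exllamav2 forward call, and the vocabulary size of 32000 is an
arbitrary placeholder:

    import torch

    def run_kernels(input_ids: torch.Tensor) -> torch.Tensor:
        # Hypothetical stand-in for exllamav2's custom CUDA kernels, which
        # launch asynchronously with respect to the host.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.randn(input_ids.shape[0], 32000, device=device)

    def forward(input_ids: torch.Tensor) -> torch.Tensor:
        logits = run_kernels(input_ids)
        if torch.cuda.is_available():
            # Block until every queued kernel has finished, so the logits are
            # fully written before any CPU-side consumer (sampling, decoding)
            # reads them.
            torch.cuda.synchronize()
        return logits

    logits = forward(torch.tensor([[1, 2, 3]]))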