diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index 62d1e054..6486e438 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -157,6 +157,9 @@ class Exllamav2HF(PreTrainedModel): else: self.past_seq = seq_tensor + if torch.cuda.is_available(): + torch.cuda.synchronize() + loss = None if labels is not None: # Shift so that tokens < n predict n diff --git a/modules/text_generation.py b/modules/text_generation.py index 2869e03c..eff6495e 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -263,9 +263,6 @@ def apply_stopping_strings(reply, all_stop_strings): def get_reply_from_output_ids(output_ids, state=None, starting_from=0): - if torch.cuda.is_available(): - torch.cuda.synchronize() - reply = decode(output_ids[starting_from:], state['skip_special_tokens'] if state else True) # Handle tokenizers that do not add the leading space for the first token