Ignore add_bos_token in instruct prompts, let the jinja2 template decide

oobabooga 2025-07-10 07:04:47 -07:00
parent 0f3a88057c
commit 635e6efd18
3 changed files with 8 additions and 9 deletions
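
Background for the change (illustrative; not part of the commit): in instruct and chat-instruct modes the prompt is rendered through the model's jinja2 chat template, and many templates emit the BOS token themselves. Honoring a user-level add_bos_token on top of that produces a doubled BOS. A minimal sketch with a hypothetical Llama-2-style template:

    from jinja2 import Template

    # Hypothetical template; real ones ship in the model's tokenizer_config.json
    chat_template = Template("{{ bos_token }}[INST] {{ message }} [/INST]")
    prompt = chat_template.render(bos_token='<s>', message='Hello')
    print(prompt)  # <s>[INST] Hello [/INST] -- the template already supplies the BOS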

modules/chat.py

@@ -643,6 +643,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     output = apply_extensions('history', output)
     state = apply_extensions('state', state)
 
+    # Let the jinja2 template handle the BOS token
+    if state['mode'] in ['instruct', 'chat-instruct']:
+        state['add_bos_token'] = False
+
     # Initialize metadata if not present
     if 'metadata' not in output:
         output['metadata'] = {}
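
Taken in isolation, the hunk above makes instruct-style sessions ignore the user's add_bos_token checkbox before tokenization, so any BOS comes solely from the chat template. A minimal sketch (the two-key dict stands in for the full webui state):

    # Stand-in for the webui state dict; only the relevant keys are shown.
    state = {'mode': 'instruct', 'add_bos_token': True}

    # Same override as in chatbot_wrapper()
    if state['mode'] in ['instruct', 'chat-instruct']:
        state['add_bos_token'] = False

    assert state['add_bos_token'] is False  # encode() will not prepend another BOS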

modules/text_generation.py

@@ -134,7 +134,6 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
         input_ids = np.array(input_ids).reshape(1, len(input_ids))
     else:
         input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
-
         if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
             if add_bos_token:
                 # Add BOS token if missing
@@ -142,13 +141,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
                     bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]])
                     input_ids = torch.cat((bos_tensor, input_ids), 1)
 
-                # Prevent double BOS tokens from jinja templates
-                while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
-                    input_ids = input_ids[:, 1:]
-            else:
-                # Remove BOS tokens when not wanted
-                while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
-                    input_ids = input_ids[:, 1:]
+            # Always prevent double BOS tokens (regardless of add_bos_token setting)
+            while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
+                input_ids = input_ids[:, 1:]
 
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]
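
Two behavioral changes in encode(): the double-BOS guard now runs whether or not add_bos_token is set, and the old else branch that stripped every leading BOS when the setting was off is gone, so a single BOS supplied by the template survives. A sketch of the guard on made-up token ids (1 stands in for shared.tokenizer.bos_token_id):

    import torch

    bos_token_id = 1  # hypothetical; the real value comes from shared.tokenizer
    input_ids = torch.tensor([[1, 1, 15043, 3186]])  # template and tokenizer both added a BOS

    # Always prevent double BOS tokens (regardless of add_bos_token setting)
    while len(input_ids[0]) > 1 and input_ids[0][0] == bos_token_id and input_ids[0][1] == bos_token_id:
        input_ids = input_ids[:, 1:]

    print(input_ids)  # tensor([[    1, 15043,  3186]]) -- exactly one BOS is kept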

modules/ui_parameters.py

@@ -84,7 +84,7 @@ def create_ui():
                     shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
                     shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
-                    shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
+                    shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Only applies to text completion (notebook). In chat mode, templates control BOS tokens.')
                     shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
                     shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
                     shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')