Use flash_attention_2 by default for Transformers models

oobabooga 2025-12-07 06:56:58 -08:00
parent 1762312fb4
commit 85f2df92e9
2 changed files with 2 additions and 2 deletions

@@ -112,7 +112,7 @@ group.add_argument('--no-cache', action='store_true', help='Set use_cache to Fal
 group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
 group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
 group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
-group.add_argument('--attn-implementation', type=str, default='sdpa', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: sdpa, eager, flash_attention_2.')
+group.add_argument('--attn-implementation', type=str, default='flash_attention_2', metavar="IMPLEMENTATION", help='Attention implementation. Valid options: flash_attention_2, sdpa, eager.')
 # bitsandbytes 4-bit
 group = parser.add_argument_group('bitsandbytes 4-bit')
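
For context, a minimal sketch of how a value like the new `--attn-implementation` default is typically forwarded to Transformers. This is not the repository's actual loader code; the `load_model` helper and the model id in the usage comments are illustrative assumptions. Using "flash_attention_2" assumes the flash-attn package is installed and a supported CUDA GPU is available; otherwise "sdpa" or "eager" would be the fallback.

import torch
from transformers import AutoModelForCausalLM

def load_model(model_name: str, attn_implementation: str = "flash_attention_2"):
    # Hypothetical helper: the parsed CLI value is passed straight through to
    # from_pretrained(); Transformers validates it against the implementations
    # the model architecture supports.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # flash-attn kernels require fp16/bf16
        attn_implementation=attn_implementation,
    )

# Usage mirroring the change in this commit (model id is illustrative):
# model = load_model("meta-llama/Llama-2-7b-hf")            # new default: flash_attention_2
# model = load_model("meta-llama/Llama-2-7b-hf", "sdpa")    # previous default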