Transformers loader: replace use_flash_attention_2/use_eager_attention with a unified attn_implementation

Closes #7107
This commit is contained in:
oobabooga 2025-07-09 18:38:45 -07:00
parent 511bb31646
commit 6c2bdda0f0
6 changed files with 5 additions and 20 deletions

View file

@@ -135,20 +135,15 @@ def load_model_HF(model_name):
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
'attn_implementation': shared.args.attn_implementation,
}
if shared.args.trust_remote_code:
params['trust_remote_code'] = True
if shared.args.use_flash_attention_2:
params['use_flash_attention_2'] = True
if shared.args.force_safetensors:
params['force_safetensors'] = True
if shared.args.use_eager_attention:
params['attn_implementation'] = 'eager'
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if 'chatglm' in model_name.lower():