Transformers loader: replace use_flash_attention_2/use_eager_attention with a unified attn_implementation

Closes #7107
This commit is contained in:
oobabooga 2025-07-09 18:38:45 -07:00
parent 511bb31646
commit 6c2bdda0f0
6 changed files with 5 additions and 20 deletions

View file

@@ -135,20 +135,15 @@ def load_model_HF(model_name):
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
'attn_implementation': shared.args.attn_implementation,
}
if shared.args.trust_remote_code:
params['trust_remote_code'] = True
if shared.args.use_flash_attention_2:
params['use_flash_attention_2'] = True
if shared.args.force_safetensors:
params['force_safetensors'] = True
if shared.args.use_eager_attention:
params['attn_implementation'] = 'eager'
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if 'chatglm' in model_name.lower():