Add TensorRT-LLM support (#5715)

This commit is contained in:
oobabooga 2024-06-24 02:30:03 -03:00 committed by GitHub
parent 536f8d58d4
commit 577a8cd3ee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 197 additions and 4 deletions

View file

@ -54,7 +54,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']:
input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ not in ['Exllamav2Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
@ -158,7 +158,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
import deepspeed