mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-04-07 23:53:40 +00:00
Bump llama-cpp-python to 0.2.64, use official wheels (#5921)
This commit is contained in:
parent
0877741b03
commit
9b623b8a78
16 changed files with 53 additions and 325 deletions
|
|
@ -1,25 +1,11 @@
|
|||
from typing import Sequence
|
||||
|
||||
import llama_cpp
|
||||
from tqdm import tqdm
|
||||
|
||||
from modules import shared
|
||||
from modules.cache_utils import process_llamacpp_cache
|
||||
|
||||
try:
|
||||
import llama_cpp
|
||||
except:
|
||||
llama_cpp = None
|
||||
|
||||
try:
|
||||
import llama_cpp_cuda
|
||||
except:
|
||||
llama_cpp_cuda = None
|
||||
|
||||
try:
|
||||
import llama_cpp_cuda_tensorcores
|
||||
except:
|
||||
llama_cpp_cuda_tensorcores = None
|
||||
|
||||
|
||||
def eval_with_progress(self, tokens: Sequence[int]):
|
||||
"""
|
||||
|
|
@ -81,7 +67,7 @@ def monkey_patch_generate(lib):
|
|||
lib.Llama.generate = my_generate
|
||||
|
||||
|
||||
for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:
|
||||
for lib in [llama_cpp]:
|
||||
if lib is not None:
|
||||
lib.Llama.eval = eval_with_progress
|
||||
monkey_patch_generate(lib)
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import os
|
|||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import llama_cpp
|
||||
import torch
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
|
||||
|
|
@ -10,32 +11,6 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
|
|||
from modules import RoPE, llama_cpp_python_hijack, shared
|
||||
from modules.logging_colors import logger
|
||||
|
||||
try:
|
||||
import llama_cpp
|
||||
except:
|
||||
llama_cpp = None
|
||||
|
||||
try:
|
||||
import llama_cpp_cuda
|
||||
except:
|
||||
llama_cpp_cuda = None
|
||||
|
||||
try:
|
||||
import llama_cpp_cuda_tensorcores
|
||||
except:
|
||||
llama_cpp_cuda_tensorcores = None
|
||||
|
||||
|
||||
def llama_cpp_lib():
|
||||
if shared.args.cpu and llama_cpp is not None:
|
||||
return llama_cpp
|
||||
elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
|
||||
return llama_cpp_cuda_tensorcores
|
||||
elif llama_cpp_cuda is not None:
|
||||
return llama_cpp_cuda
|
||||
else:
|
||||
return llama_cpp
|
||||
|
||||
|
||||
class LlamacppHF(PreTrainedModel):
|
||||
def __init__(self, model, path):
|
||||
|
|
@ -57,7 +32,7 @@ class LlamacppHF(PreTrainedModel):
|
|||
'n_tokens': self.model.n_tokens,
|
||||
'input_ids': self.model.input_ids.copy(),
|
||||
'scores': self.model.scores.copy(),
|
||||
'ctx': llama_cpp_lib().llama_new_context_with_model(model.model, model.context_params)
|
||||
'ctx': llama_cpp.llama_new_context_with_model(model.model, model.context_params)
|
||||
}
|
||||
|
||||
def _validate_model_class(self):
|
||||
|
|
@ -220,7 +195,7 @@ class LlamacppHF(PreTrainedModel):
|
|||
'split_mode': 1 if not shared.args.row_split else 2
|
||||
}
|
||||
|
||||
Llama = llama_cpp_lib().Llama
|
||||
Llama = llama_cpp.Llama
|
||||
model = Llama(**params)
|
||||
|
||||
return LlamacppHF(model, model_file)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import re
|
||||
from functools import partial
|
||||
|
||||
import llama_cpp
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
|
@ -9,32 +10,6 @@ from modules.callbacks import Iteratorize
|
|||
from modules.logging_colors import logger
|
||||
from modules.text_generation import get_max_prompt_length
|
||||
|
||||
try:
|
||||
import llama_cpp
|
||||
except:
|
||||
llama_cpp = None
|
||||
|
||||
try:
|
||||
import llama_cpp_cuda
|
||||
except:
|
||||
llama_cpp_cuda = None
|
||||
|
||||
try:
|
||||
import llama_cpp_cuda_tensorcores
|
||||
except:
|
||||
llama_cpp_cuda_tensorcores = None
|
||||
|
||||
|
||||
def llama_cpp_lib():
|
||||
if shared.args.cpu and llama_cpp is not None:
|
||||
return llama_cpp
|
||||
elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
|
||||
return llama_cpp_cuda_tensorcores
|
||||
elif llama_cpp_cuda is not None:
|
||||
return llama_cpp_cuda
|
||||
else:
|
||||
return llama_cpp
|
||||
|
||||
|
||||
def ban_eos_logits_processor(eos_token, input_ids, logits):
|
||||
logits[eos_token] = -float('inf')
|
||||
|
|
@ -60,8 +35,8 @@ class LlamaCppModel:
|
|||
@classmethod
|
||||
def from_pretrained(self, path):
|
||||
|
||||
Llama = llama_cpp_lib().Llama
|
||||
LlamaCache = llama_cpp_lib().LlamaCache
|
||||
Llama = llama_cpp.Llama
|
||||
LlamaCache = llama_cpp.LlamaCache
|
||||
|
||||
result = self()
|
||||
cache_capacity = 0
|
||||
|
|
@ -126,12 +101,12 @@ class LlamaCppModel:
|
|||
if string != self.grammar_string:
|
||||
self.grammar_string = string
|
||||
if string.strip() != '':
|
||||
self.grammar = llama_cpp_lib().LlamaGrammar.from_string(string)
|
||||
self.grammar = llama_cpp.LlamaGrammar.from_string(string)
|
||||
else:
|
||||
self.grammar = None
|
||||
|
||||
def generate(self, prompt, state, callback=None):
|
||||
LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
|
||||
LogitsProcessorList = llama_cpp.LogitsProcessorList
|
||||
prompt = prompt if type(prompt) is str else prompt.decode()
|
||||
|
||||
# Handle truncation
|
||||
|
|
|
|||
|
|
@ -41,11 +41,9 @@ loaders_and_params = OrderedDict({
|
|||
'alpha_value',
|
||||
'rope_freq_base',
|
||||
'compress_pos_emb',
|
||||
'cpu',
|
||||
'numa',
|
||||
'no_offload_kqv',
|
||||
'row_split',
|
||||
'tensorcores',
|
||||
'streaming_llm',
|
||||
'attention_sink_size',
|
||||
],
|
||||
|
|
@ -62,7 +60,6 @@ loaders_and_params = OrderedDict({
|
|||
'alpha_value',
|
||||
'rope_freq_base',
|
||||
'compress_pos_emb',
|
||||
'cpu',
|
||||
'numa',
|
||||
'cfg_cache',
|
||||
'trust_remote_code',
|
||||
|
|
@ -70,7 +67,6 @@ loaders_and_params = OrderedDict({
|
|||
'logits_all',
|
||||
'no_offload_kqv',
|
||||
'row_split',
|
||||
'tensorcores',
|
||||
'streaming_llm',
|
||||
'attention_sink_size',
|
||||
'llamacpp_HF_info',
|
||||
|
|
|
|||
|
|
@ -113,7 +113,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
|
|||
|
||||
# llama.cpp
|
||||
group = parser.add_argument_group('llama.cpp')
|
||||
group.add_argument('--tensorcores', action='store_true', help='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
|
||||
group.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
|
||||
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
|
||||
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
|
||||
|
|
@ -204,7 +203,8 @@ group = parser.add_argument_group('Multimodal')
|
|||
group.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
|
||||
|
||||
# Deprecated parameters
|
||||
# group = parser.add_argument_group('Deprecated')
|
||||
group = parser.add_argument_group('Deprecated')
|
||||
group.add_argument('--tensorcores', action='store_true', help='DEPRECATED')
|
||||
|
||||
args = parser.parse_args()
|
||||
args_defaults = parser.parse_args([])
|
||||
|
|
@ -214,7 +214,7 @@ for arg in sys.argv[1:]:
|
|||
if hasattr(args, arg):
|
||||
provided_arguments.append(arg)
|
||||
|
||||
deprecated_args = []
|
||||
deprecated_args = ['tensorcores']
|
||||
|
||||
|
||||
def do_cmd_flags_warnings():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue