Bump llama-cpp-python to 0.2.64, use official wheels (#5921)

Authored by oobabooga on 2024-04-23 23:17:05 -03:00, committed by GitHub
parent 0877741b03
commit 9b623b8a78
16 changed files with 53 additions and 325 deletions


@@ -1,25 +1,11 @@
 from typing import Sequence
 
+import llama_cpp
 from tqdm import tqdm
 
 from modules import shared
 from modules.cache_utils import process_llamacpp_cache
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
 
 def eval_with_progress(self, tokens: Sequence[int]):
     """
@@ -81,7 +67,7 @@ def monkey_patch_generate(lib):
     lib.Llama.generate = my_generate
 
 
-for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:
+for lib in [llama_cpp]:
     if lib is not None:
         lib.Llama.eval = eval_with_progress
         monkey_patch_generate(lib)
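
For orientation, the hijack now only ever has the official llama_cpp package to patch. A minimal sketch of the same monkey-patching technique in isolation; the wrapper body below is illustrative and simply delegates to the original method, unlike the repository's eval_with_progress, which re-implements the batched evaluation loop with a tqdm bar:

import llama_cpp
from tqdm import tqdm

_original_eval = llama_cpp.Llama.eval  # keep a handle to the unpatched method


def eval_with_progress_sketch(self, tokens):
    # Illustration only: show a bar sized to the prompt, then delegate to the
    # original implementation so behavior is unchanged.
    with tqdm(total=len(tokens), desc='Prompt evaluation') as bar:
        _original_eval(self, tokens)
        bar.update(len(tokens))


llama_cpp.Llama.eval = eval_with_progress_sketch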


@@ -2,6 +2,7 @@ import os
 from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
+import llama_cpp
 import torch
 from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
@@ -10,32 +11,6 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from modules import RoPE, llama_cpp_python_hijack, shared
 from modules.logging_colors import logger
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
-
-def llama_cpp_lib():
-    if shared.args.cpu and llama_cpp is not None:
-        return llama_cpp
-    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
-        return llama_cpp_cuda_tensorcores
-    elif llama_cpp_cuda is not None:
-        return llama_cpp_cuda
-    else:
-        return llama_cpp
-
 
 class LlamacppHF(PreTrainedModel):
     def __init__(self, model, path):
@@ -57,7 +32,7 @@ class LlamacppHF(PreTrainedModel):
                 'n_tokens': self.model.n_tokens,
                 'input_ids': self.model.input_ids.copy(),
                 'scores': self.model.scores.copy(),
-                'ctx': llama_cpp_lib().llama_new_context_with_model(model.model, model.context_params)
+                'ctx': llama_cpp.llama_new_context_with_model(model.model, model.context_params)
             }
 
     def _validate_model_class(self):
@@ -220,7 +195,7 @@ class LlamacppHF(PreTrainedModel):
             'split_mode': 1 if not shared.args.row_split else 2
         }
 
-        Llama = llama_cpp_lib().Llama
+        Llama = llama_cpp.Llama
         model = Llama(**params)
 
         return LlamacppHF(model, model_file)
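
For context, the dict built in the hunk above snapshots the parts of the Llama object's state that LlamacppHF swaps when it alternates between its normal cache and the CFG (negative-prompt) cache; the 'ctx' entry is a second llama.cpp context created for the negative prompt. A rough, library-agnostic sketch of that snapshot-and-swap idea, with hypothetical names (TwoStateCache, _snapshot) that are not from the repository:

class TwoStateCache:
    """Illustrative only: hold a saved copy of a model's mutable state and swap it in."""

    def __init__(self, model):
        self.model = model
        self.saved = self._snapshot()

    def _snapshot(self):
        # The real wrapper also keeps a separate llama.cpp context ('ctx') per state.
        return {
            'n_tokens': self.model.n_tokens,
            'input_ids': self.model.input_ids.copy(),
            'scores': self.model.scores.copy(),
        }

    def swap(self):
        # Exchange the live attributes with the saved snapshot and keep the old
        # live values so they can be restored on the next swap.
        current = self._snapshot()
        for key, value in self.saved.items():
            setattr(self.model, key, value)
        self.saved = current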


@@ -1,6 +1,7 @@
 import re
 from functools import partial
 
+import llama_cpp
 import numpy as np
 import torch
 
@@ -9,32 +10,6 @@ from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
-
-
-def llama_cpp_lib():
-    if shared.args.cpu and llama_cpp is not None:
-        return llama_cpp
-    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
-        return llama_cpp_cuda_tensorcores
-    elif llama_cpp_cuda is not None:
-        return llama_cpp_cuda
-    else:
-        return llama_cpp
-
 
 def ban_eos_logits_processor(eos_token, input_ids, logits):
     logits[eos_token] = -float('inf')
@@ -60,8 +35,8 @@ class LlamaCppModel:
 
     @classmethod
     def from_pretrained(self, path):
-        Llama = llama_cpp_lib().Llama
-        LlamaCache = llama_cpp_lib().LlamaCache
+        Llama = llama_cpp.Llama
+        LlamaCache = llama_cpp.LlamaCache
 
         result = self()
         cache_capacity = 0
@@ -126,12 +101,12 @@ class LlamaCppModel:
         if string != self.grammar_string:
             self.grammar_string = string
             if string.strip() != '':
-                self.grammar = llama_cpp_lib().LlamaGrammar.from_string(string)
+                self.grammar = llama_cpp.LlamaGrammar.from_string(string)
             else:
                 self.grammar = None
 
     def generate(self, prompt, state, callback=None):
-        LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
+        LogitsProcessorList = llama_cpp.LogitsProcessorList
         prompt = prompt if type(prompt) is str else prompt.decode()
 
         # Handle truncation
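
The names this file now takes directly from llama_cpp (Llama, LlamaCache, LlamaGrammar, LogitsProcessorList) are all part of the official package's high-level API. A rough sketch of how they fit together, assuming llama-cpp-python is installed; the model path, grammar text, and sampling values are placeholders rather than anything from the repository:

import llama_cpp

llm = llama_cpp.Llama(model_path='model.gguf', n_ctx=2048)  # placeholder path
llm.set_cache(llama_cpp.LlamaCache(capacity_bytes=2 * 1024 ** 3))

# A grammar compiled from GBNF text, as LlamaCppModel.load_grammar does with the UI string.
grammar = llama_cpp.LlamaGrammar.from_string('root ::= "yes" | "no"')
constrained = llm.create_completion('Answer yes or no: ', max_tokens=8, grammar=grammar)


def ban_eos(input_ids, logits):
    # Same idea as ban_eos_logits_processor above: push the EOS logit to -inf.
    logits[llm.token_eos()] = -float('inf')
    return logits


# Processors are wrapped in a LogitsProcessorList, as LlamaCppModel.generate does.
unstoppable = llm.create_completion(
    'Write a sentence:',
    max_tokens=16,
    logits_processor=llama_cpp.LogitsProcessorList([ban_eos]),
)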


@@ -41,11 +41,9 @@ loaders_and_params = OrderedDict({
         'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
-        'cpu',
         'numa',
         'no_offload_kqv',
         'row_split',
-        'tensorcores',
         'streaming_llm',
         'attention_sink_size',
     ],
@@ -62,7+60,6 @@ loaders_and_params = OrderedDict({
         'alpha_value',
         'rope_freq_base',
         'compress_pos_emb',
-        'cpu',
         'numa',
         'cfg_cache',
         'trust_remote_code',
@@ -70,7 +67,6 @@ loaders_and_params = OrderedDict({
         'logits_all',
         'no_offload_kqv',
         'row_split',
-        'tensorcores',
         'streaming_llm',
         'attention_sink_size',
         'llamacpp_HF_info',


@@ -113,7 +113,6 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
 
 # llama.cpp
 group = parser.add_argument_group('llama.cpp')
-group.add_argument('--tensorcores', action='store_true', help='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
 group.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
 group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
 group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
@@ -204,7 +203,8 @@ group = parser.add_argument_group('Multimodal')
 group.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
 
 # Deprecated parameters
-# group = parser.add_argument_group('Deprecated')
+group = parser.add_argument_group('Deprecated')
+group.add_argument('--tensorcores', action='store_true', help='DEPRECATED')
 
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@@ -214,7 +214,7 @@ for arg in sys.argv[1:]:
     if hasattr(args, arg):
         provided_arguments.append(arg)
 
-deprecated_args = []
+deprecated_args = ['tensorcores']
 
 
 def do_cmd_flags_warnings():
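
The deprecated flag stays registered so existing launch commands still parse, and the new deprecated_args entry is what do_cmd_flags_warnings (truncated above) can key off to warn users. A minimal sketch of that pattern, assuming a simple logging-based warning rather than the repository's exact message:

import argparse
import logging

logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser()
group = parser.add_argument_group('Deprecated')
group.add_argument('--tensorcores', action='store_true', help='DEPRECATED')
args = parser.parse_args()

deprecated_args = ['tensorcores']


def do_cmd_flags_warnings():
    # Warn about any deprecated flag that was actually passed instead of failing.
    for name in deprecated_args:
        if getattr(args, name):
            logger.warning(f'--{name} is deprecated and no longer has any effect.')


do_cmd_flags_warnings()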