New llama.cpp loader (#6846)

2026-04-04 22:27:29 +00:00 · 2025-04-18 09:59:37 -03:00 · 2025-04-18 09:59:37 -03:00 · ae54d8faaa
commit ae54d8faaa
parent 5c2f8d828e
23 changed files with 471 additions and 999 deletions
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -1,6 +1,7 @@
 import time
 import traceback

+import numpy as np
 import torch

 from modules import models, sampler_hijack, shared
@@ -38,70 +39,86 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
        return 'Error: No model is loaded1 Select one in the Model tab.', previous

    is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
-    is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel'
+    is_llamacpp = shared.model.__class__.__name__ == 'LlamaServer'

-    if use_samplers:
-        if any([is_non_hf_exllamav2, is_non_hf_llamacpp]):
-            logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
-            # sampling is all done in c for exllama, so it is really hard to hijack
-            # it should be possible to hijack llamacpp sampler by hijacking all their sampling methods,
-            # but it is not implemented yet
-            return 'Error: Sampler hijacking is not supported non-Huggingface loaders. Please disable the "Use samplers" option.', previous
+    if is_llamacpp:
+        logprobs = shared.model.get_logits(prompt, state, n_probs=top_logits, use_samplers=use_samplers)
+        if return_dict:
+            output = {}
+            for entry in logprobs:
+                token = repr(entry['token'])
+                prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
+                output[token] = prob

-        state['max_new_tokens'] = 1
-        state['auto_max_new_tokens'] = False
-        for _ in generate_reply(prompt, state):
-            pass
-
-        scores = sampler_hijack.global_scores[-1]
-    else:
-        if is_non_hf_exllamav2:
-            device = get_device()
-            tokens = shared.tokenizer.encode(prompt)
-            if device:
-                tokens = tokens.to(device)
-
-            scores = shared.model.get_logits(tokens)[-1][-1]
-        elif is_non_hf_llamacpp:
-            tokens = shared.tokenizer.encode(prompt)
-            scores = shared.model.get_logits(tokens)[-1][-1]
+            return output
        else:
-            device = get_device()
-            tokens = shared.tokenizer.encode(prompt, return_tensors='pt')
-            if device:
-                tokens = tokens.to(device)
+            output = ''
+            for entry in logprobs:
+                token = repr(entry['token'])
+                prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
+                output += f"{prob:.5f}  -  {token}\n"

-            output = shared.model(input_ids=tokens)
-            scores = output['logits'][-1][-1]
-
-    probs = torch.softmax(scores, dim=-1, dtype=torch.float)
-    topk_values, topk_indices = torch.topk(probs, k=top_logits, largest=True, sorted=True)
-    if is_non_hf_llamacpp:
-        topk_indices = [i.expand((1, 1)) for i in topk_indices]
-
-    if hasattr(shared.tokenizer, 'convert_ids_to_tokens'):
-        tokens = [shared.tokenizer.convert_ids_to_tokens(int(i)) for i in topk_indices]
+            return output, previous
    else:
-        tokens = [shared.tokenizer.decode(i) for i in topk_indices]
+        if not use_samplers:
+            state = {'stream': True}

-    if return_dict:
-        topk_values = [float(i) for i in topk_values]
-        output = {}
-        for row in list(zip(topk_values, tokens)):
-            key = row[1]
-            if isinstance(key, bytes):
-                try:
-                    key = key.decode()
-                except:
-                    key = key.decode('latin')
+        if use_samplers:
+            if is_non_hf_exllamav2:
+                logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
+                # sampling is all done in c for exllama, so it is really hard to hijack
+                # it should be possible to hijack llamacpp sampler by hijacking all their sampling methods,
+                # but it is not implemented yet
+                return 'Error: Sampler hijacking is not supported non-Huggingface loaders. Please disable the "Use samplers" option.', previous

-            output[key] = row[0]
+            state['max_new_tokens'] = 1
+            state['auto_max_new_tokens'] = False
+            for _ in generate_reply(prompt, state):
+                pass

-        return output
-    else:
-        topk_values = [f"{float(i):.5f}" for i in topk_values]
-        output = ''
-        for row in list(zip(topk_values, tokens)):
-            output += f"{row[0]}  -  {repr(row[1])}\n"
+            scores = sampler_hijack.global_scores[-1]
+        else:
+            if is_non_hf_exllamav2:
+                device = get_device()
+                tokens = shared.tokenizer.encode(prompt)
+                if device:
+                    tokens = tokens.to(device)

-        return output, previous
+                scores = shared.model.get_logits(tokens)[-1][-1]
+            else:
+                device = get_device()
+                tokens = shared.tokenizer.encode(prompt, return_tensors='pt')
+                if device:
+                    tokens = tokens.to(device)
+
+                output = shared.model(input_ids=tokens)
+                scores = output['logits'][-1][-1]
+
+        probs = torch.softmax(scores, dim=-1, dtype=torch.float)
+        topk_values, topk_indices = torch.topk(probs, k=top_logits, largest=True, sorted=True)
+        if hasattr(shared.tokenizer, 'convert_ids_to_tokens'):
+            tokens = [shared.tokenizer.convert_ids_to_tokens(int(i)) for i in topk_indices]
+        else:
+            tokens = [shared.tokenizer.decode(i) for i in topk_indices]
+
+        if return_dict:
+            topk_values = [float(i) for i in topk_values]
+            output = {}
+            for row in list(zip(topk_values, tokens)):
+                key = row[1]
+                if isinstance(key, bytes):
+                    try:
+                        key = key.decode()
+                    except:
+                        key = key.decode('latin')
+
+                output[key] = row[0]
+
+            return output
+        else:
+            topk_values = [f"{float(i):.5f}" for i in topk_values]
+            output = ''
+            for row in list(zip(topk_values, tokens)):
+                output += f"{row[0]}  -  {repr(row[1])}\n"
+
+            return output, previous