# Source: text-generation-webui/extensions/perplexity_colors/script.py
# Snapshot metadata: 2025-02-18 11:56:28 -03:00, 437 lines, 20 KiB, Python

import time
import html
import functools
import re
import gradio
import numpy as np
import torch
from transformers import LogitsProcessor
import colorsys
from modules import html_generator, shared
# Extension settings. The UI callbacks in ui() write into this dict and the
# modifier functions read from it on every generation.
params = {
    # When False, no logits processor is attached and output is left untouched.
    'active': True,
    # Color each token by the perplexity recorded for its generation step.
    'color_by_perplexity': False,
    # Color each token by the probability of the token that was selected.
    'color_by_probability': False,
    # Perplexity value at which the color scale saturates.
    # No slider for this right now, because I don't think it really needs to be changed.
    # Very large perplexity scores don't show up often.
    'ppl_scale': 15.0,
    # Wrap each token in a hoverable element listing the top token probabilities.
    'probability_dropdown': False,
    # For debugging mostly
    'verbose': False,
}
class PerplexityLogits(LogitsProcessor):
    '''
    Pass-through logits processor that records statistics for every call.

    Each call appends to the following lists (one entry per call):
      - perplexities_list: exp(entropy) of the softmax distribution, rounded to 4 decimals
      - generated_token_ids: the last token id of input_ids[0] at the time of the call
      - selected_probs: probability that the previous call assigned to that token
        (1.0 placeholder on the first call, where the token belongs to the prompt)
      - top_token_ids_list / top_probs_list: the top-k ids and probabilities as nested
        lists, indexed as [call][0][rank]

    The scores themselves are returned unchanged.
    '''

    def __init__(self, verbose=False, top_k=5):
        '''
        verbose: print per-token debugging information through shared.tokenizer.
        top_k: how many of the most likely tokens to record per call (expected to be
               positive; clamped to the vocabulary size).
        '''
        self.generated_token_ids = []
        self.selected_probs = []
        self.top_token_ids_list = []
        self.top_probs_list = []
        self.perplexities_list = []
        self.last_probs = None
        self.verbose = verbose
        self.top_k = top_k

    def __call__(self, input_ids, scores):
        probs = torch.softmax(scores, dim=-1, dtype=torch.float)
        # log(0) is -inf. nan_to_num replaces it with a large finite negative number so
        # that probs * log_probs evaluates to 0 for those entries instead of nan.
        log_probs = torch.nan_to_num(torch.log(probs))
        entropy = -torch.sum(probs * log_probs)
        entropy = entropy.cpu().numpy()
        perplexity = round(float(np.exp(entropy)), 4)
        self.perplexities_list.append(perplexity)

        last_token_id = int(input_ids[0][-1].cpu().numpy().item())
        # Store the generated tokens (not sure why this isn't accessible in the output endpoint!)
        self.generated_token_ids.append(last_token_id)

        # Look up the probability the previous call assigned to the token that was selected
        if len(self.selected_probs) > 0:
            prev_top_ids = self.top_token_ids_list[-1][0]
            prev_top_probs = self.top_probs_list[-1][0]
            if self.verbose:
                print(shared.tokenizer.decode(last_token_id), [shared.tokenizer.decode(token_id) for token_id in prev_top_ids],
                      [round(float(prob), 4) for prob in prev_top_probs])

            if last_token_id in prev_top_ids:
                idx = prev_top_ids.index(last_token_id)
                self.selected_probs.append(prev_top_probs[idx])
            else:
                # The selected token was outside the recorded top tokens: add it to the
                # previous call's lists so it still shows up among the alternatives.
                prev_top_ids.append(last_token_id)
                last_prob = round(float(self.last_probs[last_token_id]), 4)
                prev_top_probs.append(last_prob)
                self.selected_probs.append(last_prob)
        else:
            self.selected_probs.append(1.0)  # Placeholder for the last token of the prompt

        if self.verbose:
            pplbar = "-"
            # round() raises on nan and on infinity, so only draw the bar for finite values
            if np.isfinite(perplexity):
                pplbar = "*" * round(perplexity)

            print(f"PPL for token after {shared.tokenizer.decode(last_token_id)}: {perplexity:.2f} {pplbar}")

        # Record the top probabilities. torch.topk raises when k exceeds the size of the
        # last dimension, so clamp k to the vocabulary size.
        k = min(self.top_k, probs.shape[-1])
        top_tokens_and_probs = torch.topk(probs, k)
        top_probs = top_tokens_and_probs.values.cpu().numpy().astype(float).tolist()
        top_token_ids = top_tokens_and_probs.indices.cpu().numpy().astype(int).tolist()
        self.top_token_ids_list.append(top_token_ids)
        self.top_probs_list.append(top_probs)

        # Keep the full distribution so the next call can look up a selected token that
        # was not among the top tokens
        self.last_probs = probs.cpu().numpy().flatten()

        # Doesn't actually modify the logits!
        return scores
# Module-level handle to the PerplexityLogits instance that stores the perplexity
# and top probabilities. logits_processor_modifier assigns it and output_modifier
# reads it back; it stays None until a processor has been created.
ppl_logits_processor = None
def logits_processor_modifier(logits_processor_list, input_ids):
    '''
    Append a fresh PerplexityLogits processor to logits_processor_list.

    Does nothing when the extension is not active. The new processor is also
    stored in the module-level ppl_logits_processor so output_modifier can read
    what it recorded. input_ids is not used.
    '''
    global ppl_logits_processor
    if not params['active']:
        return

    recorder = PerplexityLogits(verbose=params['verbose'])
    ppl_logits_processor = recorder
    logits_processor_list.append(recorder)
def output_modifier(text):
    '''
    Wrap the generated tokens found in `text` in colored HTML.

    Reads the statistics stored on the module-level ppl_logits_processor, walks
    through the generated tokens in order and replaces the first occurrence of
    each one (searching from a moving cursor) with the markup produced by
    add_color_html, or by add_dropdown_html when params['probability_dropdown']
    is set. Prints the average perplexity and any token it could not locate.

    Returns `text` unchanged when the extension is inactive or no processor has
    been created; otherwise returns the modified text with surrounding
    whitespace stripped.
    '''
    global ppl_logits_processor
    if not params['active'] or ppl_logits_processor is None:
        return text

    # TODO: It's probably more efficient to do this above rather than modifying all these lists
    # Remove last element of perplexities_list, top_token_ids_list, top_tokens_list, top_probs_list since everything is off by one because this extension runs before generation
    perplexities = ppl_logits_processor.perplexities_list[:-1]
    top_token_ids_list = ppl_logits_processor.top_token_ids_list[:-1]
    top_tokens_list = [[shared.tokenizer.decode(token_id) for token_id in top_token_ids[0]] for top_token_ids in top_token_ids_list]
    top_probs_list = ppl_logits_processor.top_probs_list[:-1]
    # Remove first element of generated_token_ids, generated_tokens, selected_probs because they are for the last token of the prompt
    gen_token_ids = ppl_logits_processor.generated_token_ids[1:]
    gen_tokens = [shared.tokenizer.decode(token_id) for token_id in gen_token_ids]
    sel_probs = ppl_logits_processor.selected_probs[1:]

    # Closing markup emitted by add_dropdown_html / add_color_html respectively.
    end_part = '</div></div>' if params['probability_dropdown'] else '</span>'  # Helps with finding the index after replacing part of the text.

    # Initial space added to deal with some tokenizers...
    # Used to find where the message started generating, for working with "continue" generations
    # Doesn't work for longer messages... Not sure how I should handle this
    full_msg = shared.tokenizer.decode([token_id for token_id in gen_token_ids[:-1]]).strip()
    # Space at the beginning to account for tokenization spaces...
    text = ' ' + html.unescape(text)
    # There was an issue with tab lengths being off by one...
    # Seems like it might be model-dependent...
    #text = re.sub(r'( {3,})', r'\1 ', text)
    # Subtracting 2 to hopefully help with the tokenization spaces and continue issues,
    # Though it's possible it could overwrite the previous token if it's the same in the last 2 chars
    # `i` is the cursor: replacements are only searched for in text[i:].
    i = text.find(full_msg) - 2
    if i < 0:
        # Backup, try removing the extra whitespace (needed for continue)
        i = text.find(full_msg.strip()) - 2
        if i < 0:
            # Not found at all (find() returned -1) or found within the first two characters
            i = 0

    # Add token index for ability to regenerate from there
    nonwhitespace_token_found = False
    for index, token, prob, ppl, top_tokens, top_probs in zip(range(len(gen_tokens)), gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list):
        # Skip whitespace-only tokens until the first token with visible content
        # Somehow this works without issues, but not sure how...
        if not nonwhitespace_token_found and token.strip() == '':
            continue

        nonwhitespace_token_found = True
        # First recorded top probability for this step, used for relative coloring
        max_prob = top_probs[0][0]
        color = 'ffffff'
        if params['color_by_probability'] and params['color_by_perplexity']:
            color = probability_perplexity_color_scale(prob, max_prob, ppl)
        elif params['color_by_perplexity']:
            color = perplexity_color_scale(ppl)
        elif params['color_by_probability']:
            color = probability_color_scale(prob)

        if token.strip() in text[i:]:
            # Only the first occurrence after the cursor is replaced (count=1)
            if params['probability_dropdown']:
                text = text[:i] + text[i:].replace(token.replace('\n', ''), add_dropdown_html(token, index, color, top_tokens, top_probs[0], ppl), 1)
            else:
                text = text[:i] + text[i:].replace(token.replace('\n', ''), add_color_html(token, color), 1)

            # Move the cursor just past the closing markup of the element that was inserted
            # This might be slightly inefficient
            # NOTE(review): the membership test above uses token.strip() while the replacement
            # searches for token.replace('\n', ''); if the latter is absent nothing is replaced
            # and find() may return -1 here, which still moves the cursor - confirm this is intended.
            i += text[i:].find(end_part) + len(end_part)
        else:
            print('Missing token:', token, '...', text[i:i+20])

    # Use full perplexity list for calculating the average here.
    # Fix issue with mean of empty slice
    if len(ppl_logits_processor.perplexities_list) > 1:
        print('Average perplexity:', round(np.mean(ppl_logits_processor.perplexities_list[:-1]), 4))

    return text.strip()  # Remove extra beginning whitespace that some tokenizers add
def probability_color_scale(prob):
    '''
    Green-yellow-red color scale.

    Maps a probability to a 6-digit hex color string (no leading '#'):
    0.0 is red, 1.0 is green. Values outside [0, 1] are clamped so the result
    always stays on the scale, and nan falls back to white ('ffffff') instead
    of raising.
    '''
    if np.isnan(prob):
        return 'ffffff'

    prob = min(max(prob, 0.0), 1.0)
    # hue (0.0 = red, 0.33 = green)
    # saturation (0.0 = gray / white, 1.0 = normal, just leave at 1.0)
    # brightness (0.0 = black, 1.0 = brightest, use something in between for better readability if you want...)
    hue = prob * 0.33
    rv, gv, bv = colorsys.hsv_to_rgb(hue, 1.0, 1.0)
    # to hex
    return f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
def perplexity_color_scale(ppl, scale=None):
    '''
    Red component only, white for 0 perplexity (sorry if you're not in dark mode)

    Maps a perplexity to a 6-digit hex color string (no leading '#'). The
    saturation grows linearly from white at 0 to full red at `scale`.

    ppl: perplexity value. Clipped to [0, scale]; nan yields white ('ffffff')
         instead of raising.
    scale: perplexity at which the color saturates. Defaults to
           params['ppl_scale']. A non-positive scale also yields white.
    '''
    if scale is None:
        scale = params['ppl_scale']

    if np.isnan(ppl) or not scale > 0:
        return 'ffffff'

    # hue (0.0 = red)
    # saturation (1.0 = red)
    # brightness (0.0 = black, 1.0 = red)
    # scale saturation from white to red the higher the perplexity
    ppl = min(max(ppl, 0.0), scale)  # clip ppl to 0-scale for color scaling. 15 should be fine for clipping and scaling
    sat = ppl / scale
    rv, gv, bv = colorsys.hsv_to_rgb(0.0, sat, 1.0)
    # to hex
    return f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
def probability_perplexity_color_scale(prob, max_prob, ppl, scale=None):
    '''
    Green-yellow-red for relative probability compared to maximum for the current token, and blue component for perplexity

    Returns a 6-digit hex color string (no leading '#').

    prob: probability of the selected token.
    max_prob: highest probability for the same step. When it is not positive
              the relative probability is treated as 0 instead of dividing by zero.
    ppl: perplexity for the step, clipped to [0, scale].
    scale: perplexity at which the blue component saturates. Defaults to
           params['ppl_scale'].

    Any nan input, or a non-positive scale, yields white ('ffffff') instead of raising.
    '''
    if scale is None:
        scale = params['ppl_scale']

    if np.isnan(prob) or np.isnan(max_prob) or np.isnan(ppl) or not scale > 0:
        return 'ffffff'

    relative_prob = prob / max_prob if max_prob > 0 else 0.0
    relative_prob = min(max(relative_prob, 0.0), 1.0)
    hue = relative_prob * 0.33
    rv, gv, _ = colorsys.hsv_to_rgb(hue, 1.0, 1.0)

    ppl = min(max(ppl, 0.0), scale)  # clip ppl to 0-scale for color scaling. 15 should be fine for clipping and scaling
    bv = ppl / scale

    # to hex
    return f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
def add_color_html(token, color):
    '''
    Return a <span> that shows `token` in the given hex `color` (no leading '#').

    The token is rendered through repr() with the surrounding quotes removed,
    so characters such as newlines stay visible, and is HTML-escaped.
    '''
    visible_token = html.escape(repr(token)[1:-1])
    return f'<span style="color: #{color}">{visible_token}</span>'
# TODO: Might also need message index for the click-to-regenerate feature to work... For now it only works in the last message, which I think is fine.
# TODO: Major issue: Applying this to too many tokens will cause a permanent slowdown in generation speed until the messages are removed from the history. The slowdown seems to be mostly resolved in the current version though
# I think the issue is from HTML elements taking up space in the visible history, and things like history deepcopy add latency proportional to the size of the history.
# Potential solution is maybe to modify the main generation code to send just the internal text and not the visible history, to avoid moving too much around.
# I wonder if we can also avoid using deepcopy here.
def add_dropdown_html(token, index, color, top_tokens, top_probs, perplexity=0):
    '''
    Return a hoverable <div> for `token` with a table of alternative tokens.

    token: the generated token text.
    index: position of the token, used in the element ids (tok_<index>, opt_<index>_<n>).
    color: hex color (no leading '#') for the token itself.
    top_tokens / top_probs: alternative tokens and their probabilities, paired in order.
    perplexity: when non-zero, an extra "Perplexity:" row is added.

    About 750 characters per token...
    '''
    # Use the repr to get characters like \n visible. Exclude the quotes around it
    pieces = [
        f'<div class="hoverable" id="tok_{index}"><span style="color: #{color}">{html.escape(repr(token)[1:-1])}</span><div class="dropdown"><table class="dropdown-content"><tbody>'
    ]

    # TODO: Bold for selected token?
    # Using divs prevented the problem of divs inside spans causing issues.
    # Now the problem is that divs show the same whitespace of one space between every token.
    # There is probably some way to fix this in CSS that I don't know about.
    for option_index, (token_option, prob) in enumerate(zip(top_tokens, top_probs)):
        row_color = probability_color_scale(prob)
        row_class = ''
        if token_option == token:
            row_class = ' class="selected"'
        # This time we want to include the quotes around it so that we can see where the spaces are.
        pieces.append(f'<tr{row_class}><td id="opt_{index}_{option_index}" style="color: #{row_color}">{html.escape(repr(token_option))}</td><td style="color: #{row_color}">{prob:.4f}</td></tr>')

    if perplexity != 0:
        ppl_color = perplexity_color_scale(perplexity)
        pieces.append(f'<tr><td>Perplexity:</td><td style="color: #{ppl_color}">{perplexity:.4f}</td></tr>')

    pieces.append('</tbody></table></div></div>')
    return ''.join(pieces)
def custom_css():
    '''
    Return the stylesheet for the hoverable tokens and their dropdown tables.

    The notes in front of the .chat rule are written as CSS comments. They
    used to start with '#', which is not comment syntax in CSS: the text was
    parsed as part of the selector of the following rule, making that selector
    invalid so the .chat rule was discarded.
    '''
    return """
    .dropdown {
        display: none;
        position: absolute;
        z-index: 50;
        background-color: var(--background-fill-secondary);
        box-shadow: 0px 8px 16px 0px rgba(0,0,0,1.0);
        width: max-content;
        overflow: visible;
        padding: 5px;
        border-radius: 10px;
        border: 1px solid var(--border-color-primary);
    }

    .dropdown-content {
        border: none;
        z-index: 50;
    }

    .dropdown-content tr.selected {
        background-color: var(--background-fill-primary);
    }

    .dropdown-content td {
        color: var(--body-text-color);
    }

    .hoverable {
        color: var(--body-text-color);
        position: relative;
        display: inline-block;
        overflow: visible;
        font-size: 15px;
        line-height: 1.75;
        margin: 0;
        padding: 0;
    }

    .hoverable:hover .dropdown {
        display: block;
    }

    pre {
        white-space: pre-wrap;
    }

    /* TODO: This makes the hover menus extend outside the bounds of the chat area, which is good.
       However, it also makes the scrollbar disappear, which is bad.
       The scroll bar needs to still be present. So for now, we can't see dropdowns that extend past the edge of the chat area. */
    .chat {
        overflow-y: auto;
    }
    """
def custom_js():
    '''
    Return the JavaScript that makes the dropdown options clickable.

    Clicking an option (an element whose id contains "opt_") rebuilds the
    message text up to the clicked token, substitutes the chosen alternative,
    writes the result into the chat input and then clicks the "Replace-last"
    and "Continue" buttons.

    Token text is read through textContent. Reading it through innerHTML
    returned the serialized markup, so tokens containing <, > or & ended up in
    the chat input as HTML entities.

    The script deliberately contains no backslashes (String.fromCharCode(92)
    is used instead) because it lives in a regular Python string literal.
    '''
    return """
    function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    // Note that this will only work as intended on the last agent message
    document.addEventListener("click", async function(event) {
        //console.log(event.target);
        const id = event.target.id;
        if (id.includes("opt_")) {
            const id_parts = id.split("_");
            const token_index = id_parts[1];
            const option_index = id_parts[2];
            // Exclude the quotes and convert newlines... Not sure about the newlines though
            // TODO: Seems like continuing generation from a newline causes problems whether you add it or not!
            // textContent gives the token as displayed, with HTML entities already decoded
            const option_text = event.target.textContent;
            const token_string = option_text.substring(1, option_text.length-1).replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '');
            //console.log(token_index + ", " + option_index + ", " + token_string);
            // Get all the previous text (I'm sure there is a more efficient way to do this)
            var msg_text = ""
            const msg_html = event.target.parentElement.parentElement.parentElement.parentElement.parentElement.parentElement;
            var msg_parts = msg_html.childNodes;
            for (var i = 0; i < msg_parts.length; i++) {
                var msg_part = msg_parts[i];
                if (msg_part.nodeType === Node.ELEMENT_NODE) {
                    if (msg_part.nodeName == "DIV") {
                        var current_token_index = msg_part.id.split("_")[1];
                        if (current_token_index == token_index) {
                            // Use the replacement token
                            // TODO: Don't have access to the tokenizer here, and sometimes there needs to be a space added before this token
                            msg_text += token_string
                            break;
                        }
                        else {
                            // Replace here or at the end?
                            var text = msg_part.firstChild.textContent.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '')
                            msg_text += text;
                        }
                    }
                    else {
                        // Break tag (hacky workaround because the newline literal can't be parsed here)
                        //msg_text += String.fromCharCode(10);
                        // Do nothing???
                    }
                }
                else if (msg_part.nodeType === Node.TEXT_NODE) {
                    msg_text += msg_part.textContent;
                }
            }
            var textbox = document.querySelector("#chat-input textarea");
            textbox.focus();
            textbox.value = msg_text.trimStart() // Fix initial tokenization spaces
            //console.log(textbox.value);

            // Add some delays to make sure it's processed correctly. Without these, there's a chance the events don't go through correctly and it doesn't work
            // It's unknown how long this will take, and probably depends on the size of the message...
            // It would be better to somehow wait for gradio to update instead of waiting a fixed amount of time.
            // Hopefully 1 second of delay before starting generation isn't unacceptable.
            var inputEvent = new Event('input', {
                bubbles: true,
                cancelable: true,
            });
            textbox.dispatchEvent(inputEvent);
            var changeEvent = new Event('change', {
                bubbles: true,
                cancelable: true,
            });
            textbox.dispatchEvent(changeEvent);
            await sleep(250);
            document.getElementById("Replace-last").click();
            // This can take a while to execute
            await sleep(750);
            document.getElementById("Continue").click();
        }
    });

    console.log("Custom JS for perplexity_colors loaded");
    """
# Replacement installed into html_generator (monkeypatch).
# Markdown is intentionally not rendered into HTML: the text is wrapped in <pre>
# tags so that whitespace is preserved. When tokens are colored by perplexity or
# probability, and especially with the probability dropdown, seeing the tokens
# the model actually produced matters more than rendered ```code blocks``` or
# *italics*.
@functools.lru_cache(maxsize=4096)
def convert_to_markdown(string):
    '''Return `string` wrapped in <pre> tags; results are cached per input string.'''
    return ''.join(('<pre>', string, '</pre>'))
def convert_to_markdown_wrapped(string, use_cache=True):
    '''
    Call convert_to_markdown, optionally bypassing its lru_cache.

    With use_cache false the undecorated function (__wrapped__) is called, so
    the result is neither looked up in nor stored into the cache.
    '''
    converter = convert_to_markdown if use_cache else convert_to_markdown.__wrapped__
    return converter(string)
# Install the <pre>-wrapping converter in place of html_generator's own
# convert_to_markdown as soon as this module is imported.
# This is still necessary for formatting to work correctly
html_generator.convert_to_markdown = convert_to_markdown
def ui():
    '''
    Build the extension's checkboxes and keep `params` in sync with them.

    Each checkbox starts from the current value in `params` rather than a
    hard-coded default, so the UI reflects any value that was changed in
    `params` before the UI is built. Every change event writes the new value
    back into `params`.
    '''
    def update_active_check(x):
        params.update({'active': x})

    def update_color_by_ppl_check(x):
        params.update({'color_by_perplexity': x})

    def update_color_by_prob_check(x):
        params.update({'color_by_probability': x})

    def update_prob_dropdown_check(x):
        params.update({'probability_dropdown': x})

    active_check = gradio.Checkbox(value=params['active'], label="Compute probabilities and perplexity scores", info="Activate this extension. Note that this extension currently does not work with llama.cpp, but it does work with ExLlamav2_HF and llamacpp_HF when set up correctly")
    color_by_ppl_check = gradio.Checkbox(value=params['color_by_perplexity'], label="Color by perplexity", info="Higher perplexity is more red. If also showing probability, higher perplexity has more blue component.")
    color_by_prob_check = gradio.Checkbox(value=params['color_by_probability'], label="Color by probability", info="Green-yellow-red linear scale, with 100% green, 50% yellow, 0% red.")
    prob_dropdown_check = gradio.Checkbox(value=params['probability_dropdown'], label="Probability dropdown", info="Hover over a token to show a dropdown of top token probabilities. Currently slightly buggy with whitespace between tokens.")

    # The callbacks only store the value, so no output component is given
    active_check.change(update_active_check, active_check, None)
    color_by_ppl_check.change(update_color_by_ppl_check, color_by_ppl_check, None)
    color_by_prob_check.change(update_color_by_prob_check, color_by_prob_check, None)
    prob_dropdown_check.change(update_prob_dropdown_check, prob_dropdown_check, None)