diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index fee54196..8eb03299 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,8 +13,8 @@ jobs: - uses: actions/stale@v5 with: stale-issue-message: "" - close-issue-message: "This issue has been closed due to inactivity for 2 months. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment." - days-before-issue-stale: 60 + close-issue-message: "This issue has been closed due to inactivity for 6 months. If you believe it is still relevant, please leave a comment below. You can tag a developer in your comment." + days-before-issue-stale: 180 days-before-issue-close: 0 stale-issue-label: "stale" days-before-pr-stale: -1 diff --git a/README.md b/README.md index a699a9ab..faeeb1e9 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features * 3 interface modes: default (two columns), notebook, and chat. -* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). +* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). * Dropdown menu for quickly switching between different models. * Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. * [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character). @@ -76,12 +76,12 @@ conda activate textgen | System | GPU | Command | |--------|---------|---------| -| Linux/WSL | NVIDIA | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121` | -| Linux/WSL | CPU only | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/rocm5.6` | -| MacOS + MPS | Any | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1` | -| Windows | NVIDIA | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121` | -| Windows | CPU only | `pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1` | +| Linux/WSL | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` | +| Linux/WSL | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu` | +| Linux | AMD | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.6` | +| MacOS + MPS | Any | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` | +| Windows | NVIDIA | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121` | +| Windows | CPU only | `pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2` | The up-to-date commands can be found here: https://pytorch.org/get-started/locally/. @@ -146,7 +146,7 @@ Then browse to 1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12: ``` -pip3 install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118 +pip3 install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118 conda install -y -c "nvidia/label/cuda-11.8.0" cuda-runtime ``` diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 8891d180..bdf68aad 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -62,8 +62,8 @@ .gradio-container .chat .user-message { padding: 20px; - padding-left: 0px; - padding-right: 0px; + padding-left: 0; + padding-right: 0; background-color: transparent; border-radius: 8px; border-bottom-right-radius: 0; diff --git a/css/main.css b/css/main.css index a1ff2a36..498b3c6c 100644 --- a/css/main.css +++ b/css/main.css @@ -96,7 +96,7 @@ gradio-app > :first-child { .header_bar { background-color: #f7f7f7; - box-shadow: 0 0px 3px rgba(22 22 22 / 35%); + box-shadow: 0 0 3px rgba(22 22 22 / 35%); margin-bottom: 0; overflow-x: scroll; margin-left: calc(-1 * var(--size-4)); @@ -221,6 +221,7 @@ button { .pretty_scrollbar::-webkit-scrollbar { width: 7px; + height: 7px; } .pretty_scrollbar::-webkit-scrollbar-track { @@ -245,6 +246,10 @@ button { background: #374151; } +.pretty_scrollbar::-webkit-scrollbar-corner { + background: transparent; +} + audio { max-width: 100%; } @@ -433,12 +438,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .message-body code { white-space: pre-wrap !important; word-wrap: break-word !important; - border: 1px solid #666666; + border: 1px solid #666; border-radius: 5px; font-size: 82%; padding: 1px 3px; background: #0d1117 !important; - color: rgb(201, 209, 217); + color: rgb(201 209 217); } .message-body pre > code { @@ -695,7 +700,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { @media screen and (width >= 1327px) { #past-chats-row { position: absolute; - top: 16px; + top: 36px; left: 0; width: calc(0.5*(var(--document-width) - 880px - 120px - 16px*2)); max-width: 300px; @@ -743,3 +748,47 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: none; } } + +#past-chats { + max-height: calc(100vh - 195px); + overflow-y: scroll !important; + border-radius: 0; + scrollbar-width: none; /* Hide scrollbar in Firefox by default */ +} + +#past-chats label { + width: 100%; + background-color: transparent !important; + background: none; + border: 0; + border-radius: 0; + padding-top: 8px; + padding-bottom: 8px; +} + +#past-chats > :nth-child(2) { + display: none; +} + +#past-chats > :nth-child(3) { + gap: 0; +} + +#past-chats::-webkit-scrollbar { + display: none; +} + +#past-chats:hover { + scrollbar-width: auto; +} + +#past-chats:hover::-webkit-scrollbar { + display: block; +} + +@media screen and (width < 1327px) { + #past-chats { + max-height: 300px; + } +} + diff --git a/docker/TensorRT-LLM/Dockerfile b/docker/TensorRT-LLM/Dockerfile new file mode 100644 index 00000000..ae503c94 --- /dev/null +++ b/docker/TensorRT-LLM/Dockerfile @@ -0,0 +1,27 @@ +FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-runtime + +# Install Git +RUN apt update && apt install -y git + +# System-wide TensorRT-LLM requirements +RUN apt install -y openmpi-bin libopenmpi-dev + +# Set the working directory +WORKDIR /app + +# Install text-generation-webui +RUN git clone https://github.com/oobabooga/text-generation-webui +WORKDIR /app/text-generation-webui +RUN pip install -r requirements.txt + +# This is needed to avoid an error about "Failed to build mpi4py" in the next command +ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH + +# Install TensorRT-LLM +RUN pip3 install tensorrt_llm==0.10.0 -U --pre --extra-index-url https://pypi.nvidia.com + +# Expose the necessary port for the Python server +EXPOSE 7860 5000 + +# Run the Python server.py script with the specified command +CMD ["python", "server.py", "--api", "--listen"] diff --git a/docs/02 - Default and Notebook Tabs.md b/docs/02 - Default and Notebook Tabs.md index c450635e..4bb78448 100644 --- a/docs/02 - Default and Notebook Tabs.md +++ b/docs/02 - Default and Notebook Tabs.md @@ -18,13 +18,13 @@ In the **Prompt** menu, you can select from some predefined prompts defined unde ### Output -Four tabs can be found: +Five tabs can be found: * **Raw**: where the raw text generated by the model appears. * **Markdown**: it contains a "Render" button. You can click on it at any time to render the current output as markdown. This is particularly useful for models that generate LaTeX equations like GALACTICA. * **HTML**: displays the output in an HTML style that is meant to be easier to read. Its style is defined under `text-generation-webui/css/html_readable_style.css`. * **Logits**: when you click on "Get next token probabilities", this tab displays the 50 most likely next tokens and their probabilities based on your current input. If "Use samplers" is checked, the probabilities will be the ones after the sampling parameters in the "Parameters" > "Generation" tab are applied. Otherwise, they will be the raw probabilities generated by the model. -* **Tokens**: allows you to tokenize your prompt and see the ID numbers for the individuals tokens. +* **Tokens**: allows you to tokenize your prompt and see the ID numbers for the individual tokens. ## Notebook tab diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index fee57d33..b00a1f34 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -219,7 +219,7 @@ print() ### Environment variables -The following environment variables can be used (they take precendence over everything else): +The following environment variables can be used (they take precedence over everything else): | Variable Name | Description | Example Value | |------------------------|------------------------------------|----------------------------| diff --git a/docs/README.md b/docs/README.md index d2efbf1d..666ee85c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,4 +1,4 @@ -These files is a mirror of the documentation at: +These files are a mirror of the documentation at: # https://github.com/oobabooga/text-generation-webui/wiki diff --git a/js/main.js b/js/main.js index 72568e1b..e9a980e2 100644 --- a/js/main.js +++ b/js/main.js @@ -98,20 +98,6 @@ document.addEventListener("keydown", function(event) { document.getElementById("Impersonate").click(); } - // Switch between tabs on Tab - else if (!event.ctrlKey && !event.shiftKey && !event.altKey && !event.metaKey && event.key === "Tab") { - event.preventDefault(); - var parametersButton = document.getElementById("parameters-button"); - var parentContainer = parametersButton.parentNode; - var selectedChild = parentContainer.querySelector(".selected"); - - if (selectedChild.id == "parameters-button") { - document.getElementById(previousTabId).click(); - } else { - previousTabId = selectedChild.id; - parametersButton.click(); - } - } }); //------------------------------------------------ @@ -548,3 +534,8 @@ document.querySelectorAll(".focus-on-chat-input").forEach(element => { document.querySelector("#chat-input textarea").focus(); }); }); + +//------------------------------------------------ +// Fix a border around the "past chats" menu +//------------------------------------------------ +document.getElementById("past-chats").parentNode.style.borderRadius = "0px"; diff --git a/modules/LoRA.py b/modules/LoRA.py index 15132f4e..eda5e406 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -73,7 +73,7 @@ def add_lora_autogptq(lora_names): if len(lora_names) > 1: logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.') if not shared.args.no_inject_fused_attention: - logger.warning('Fused Atttention + AutoGPTQ may break Lora loading. Disable it.') + logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.') peft_config = GPTQLoraConfig( inference_mode=True, diff --git a/modules/RoPE.py b/modules/RoPE.py deleted file mode 100644 index 31163a33..00000000 --- a/modules/RoPE.py +++ /dev/null @@ -1,18 +0,0 @@ -def get_alpha_value(alpha, base): - ''' - Gets alpha_value from alpha_value and rope_freq_base - ''' - if base > 0: - return (base / 10000.) ** (63 / 64.) - else: - return alpha - - -def get_rope_freq_base(alpha, base): - ''' - Gets rope_freq_base from alpha_value and rope_freq_base - ''' - if base > 0: - return base - else: - return 10000 * alpha ** (64 / 63.) diff --git a/modules/block_requests.py b/modules/block_requests.py index 0eb10fa2..778b9f5a 100644 --- a/modules/block_requests.py +++ b/modules/block_requests.py @@ -43,19 +43,27 @@ def my_open(*args, **kwargs): with original_open(*args, **kwargs) as f: file_contents = f.read() - file_contents = file_contents.replace(b'\t\t', b'') - file_contents = file_contents.replace(b'cdnjs.cloudflare.com', b'127.0.0.1') + if len(args) > 1 and args[1] == 'rb': + file_contents = file_contents.decode('utf-8') + + file_contents = file_contents.replace('\t\t', '') + file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1') file_contents = file_contents.replace( - b'', - b'\n ' - b'\n ' - b'\n ' - b'\n ' - b'\n ' - b'\n ' + '', + '\n ' + '\n ' + '\n ' + '\n ' + '\n ' + '\n ' ) - return io.BytesIO(file_contents) + if len(args) > 1 and args[1] == 'rb': + file_contents = file_contents.encode('utf-8') + return io.BytesIO(file_contents) + else: + return io.StringIO(file_contents) + else: return original_open(*args, **kwargs) diff --git a/modules/chat.py b/modules/chat.py index 43f5466b..5d2bdd63 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -492,7 +492,7 @@ def save_history(history, unique_id, character, mode): p.parent.mkdir(parents=True) with open(p, 'w', encoding='utf-8') as f: - f.write(json.dumps(history, indent=4)) + f.write(json.dumps(history, indent=4, ensure_ascii=False)) def rename_history(old_id, new_id, character, mode): @@ -505,17 +505,16 @@ def rename_history(old_id, new_id, character, mode): logger.error(f"The following path is not allowed: \"{new_p}\".") elif new_p == old_p: logger.info("The provided path is identical to the old one.") + elif new_p.exists(): + logger.error(f"The new path already exists and will not be overwritten: \"{new_p}\".") else: logger.info(f"Renaming \"{old_p}\" to \"{new_p}\"") old_p.rename(new_p) -def find_all_histories(state): - if shared.args.multi_user: - return [''] - +def get_paths(state): if state['mode'] == 'instruct': - paths = Path('logs/instruct').glob('*.json') + return Path('logs/instruct').glob('*.json') else: character = state['character_menu'] @@ -533,12 +532,55 @@ def find_all_histories(state): p.parent.mkdir(exist_ok=True) new_p.rename(p) - paths = Path(f'logs/chat/{character}').glob('*.json') + return Path(f'logs/chat/{character}').glob('*.json') + +def find_all_histories(state): + if shared.args.multi_user: + return [''] + + paths = get_paths(state) histories = sorted(paths, key=lambda x: x.stat().st_mtime, reverse=True) - histories = [path.stem for path in histories] + return [path.stem for path in histories] - return histories + +def find_all_histories_with_first_prompts(state): + if shared.args.multi_user: + return [] + + paths = get_paths(state) + histories = sorted(paths, key=lambda x: x.stat().st_mtime, reverse=True) + + result = [] + for i, path in enumerate(histories): + filename = path.stem + if re.match(r'^[0-9]{8}-[0-9]{2}-[0-9]{2}-[0-9]{2}$', filename): + with open(path, 'r', encoding='utf-8') as f: + data = json.load(f) + + first_prompt = "" + if 'visible' in data and len(data['visible']) > 0: + if data['internal'][0][0] == '<|BEGIN-VISIBLE-CHAT|>': + if len(data['visible']) > 1: + first_prompt = html.unescape(data['visible'][1][0]) + elif i == 0: + first_prompt = "New chat" + else: + first_prompt = html.unescape(data['visible'][0][0]) + elif i == 0: + first_prompt = "New chat" + else: + first_prompt = filename + + first_prompt = first_prompt.strip() + + # Truncate the first prompt if it's longer than 32 characters + if len(first_prompt) > 32: + first_prompt = first_prompt[:29] + '...' + + result.append((first_prompt, filename)) + + return result def load_latest_history(state): @@ -569,17 +611,17 @@ def load_history_after_deletion(state, idx): if shared.args.multi_user: return start_new_chat(state) - histories = find_all_histories(state) + histories = find_all_histories_with_first_prompts(state) idx = min(int(idx), len(histories) - 1) idx = max(0, idx) if len(histories) > 0: - history = load_history(histories[idx], state['character_menu'], state['mode']) + history = load_history(histories[idx][1], state['character_menu'], state['mode']) else: history = start_new_chat(state) - histories = find_all_histories(state) + histories = find_all_histories_with_first_prompts(state) - return history, gr.update(choices=histories, value=histories[idx]) + return history, gr.update(choices=histories, value=histories[idx][1]) def update_character_menu_after_deletion(idx): diff --git a/modules/github.py b/modules/github.py index 282267b6..f3dc26e1 100644 --- a/modules/github.py +++ b/modules/github.py @@ -32,7 +32,7 @@ def clone_or_pull_repository(github_url): yield f"Cloning {github_url}..." clone_output = subprocess.check_output(["git", "clone", github_url, repo_path], stderr=subprocess.STDOUT) new_extensions.add(repo_name) - yield f"The extension `{repo_name}` has been downloaded.\n\nPlease close the the web UI completely and launch it again to be able to load it." + yield f"The extension `{repo_name}` has been downloaded.\n\nPlease close the web UI completely and launch it again to be able to load it." return clone_output.decode() except subprocess.CalledProcessError as e: return str(e) diff --git a/modules/html_generator.py b/modules/html_generator.py index 7e50f561..657133bd 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -85,15 +85,20 @@ def convert_to_markdown(string): # Unfinished list, like "\n1.". A |delete| string is added and then # removed to force a
. - if re.search(r'(\n\d+\.?|\n\*\s*)$', result): + list_item_pattern = r'(\n\d+\.?|\n\s*[-*+]\s*([*_~]{1,3})?)$' + if re.search(list_item_pattern, result): delete_str = '|delete|' if re.search(r'(\d+\.?)$', result) and not result.endswith('.'): result += '.' - result = re.sub(r'(\n\d+\.?|\n\*\s*)$', r'\g<1> ' + delete_str, result) + # Add the delete string after the list item + result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result) + # Convert to HTML using markdown html_output = markdown.markdown(result, extensions=['fenced_code', 'tables']) + + # Remove the delete string from the HTML output pos = html_output.rfind(delete_str) if pos > -1: html_output = html_output[:pos] + html_output[pos + len(delete_str):] diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index f30be66a..74af5fbf 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel from transformers.modeling_outputs import CausalLMOutputWithPast -from modules import RoPE, llama_cpp_python_hijack, shared +from modules import llama_cpp_python_hijack, shared from modules.logging_colors import logger try: @@ -212,7 +212,7 @@ class LlamacppHF(PreTrainedModel): 'mul_mat_q': not shared.args.no_mul_mat_q, 'numa': shared.args.numa, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), + 'rope_freq_base': shared.args.rope_freq_base, 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'logits_all': shared.args.logits_all, diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index b2a25d36..d62fd517 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -4,7 +4,7 @@ from functools import partial import numpy as np import torch -from modules import RoPE, llama_cpp_python_hijack, shared +from modules import llama_cpp_python_hijack, shared from modules.callbacks import Iteratorize from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length @@ -92,7 +92,7 @@ class LlamaCppModel: 'mul_mat_q': not shared.args.no_mul_mat_q, 'numa': shared.args.numa, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), + 'rope_freq_base': shared.args.rope_freq_base, 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'offload_kqv': not shared.args.no_offload_kqv, diff --git a/modules/loaders.py b/modules/loaders.py index 5099ffb0..1da37595 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -22,7 +22,6 @@ loaders_and_params = OrderedDict({ 'no_use_fast', 'use_flash_attention_2', 'alpha_value', - 'rope_freq_base', 'compress_pos_emb', 'disable_exllama', 'disable_exllamav2', @@ -38,7 +37,6 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'mlock', 'no_mul_mat_q', - 'alpha_value', 'rope_freq_base', 'compress_pos_emb', 'cpu', @@ -60,7 +58,6 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'mlock', 'no_mul_mat_q', - 'alpha_value', 'rope_freq_base', 'compress_pos_emb', 'cpu', @@ -134,6 +131,11 @@ loaders_and_params = OrderedDict({ 'hqq_backend', 'trust_remote_code', 'no_use_fast', + ], + 'TensorRT-LLM': [ + 'max_seq_len', + 'cpp_runner', + 'tensorrt_llm_info', ] }) @@ -319,6 +321,16 @@ loaders_samplers = { 'skip_special_tokens', 'auto_max_new_tokens', }, + 'TensorRT-LLM': { + 'temperature', + 'top_p', + 'top_k', + 'repetition_penalty', + 'presence_penalty', + 'frequency_penalty', + 'ban_eos_token', + 'auto_max_new_tokens', + } } diff --git a/modules/logits.py b/modules/logits.py index 1fe2e73e..4233c8a5 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -16,15 +16,20 @@ def get_next_logits(*args, **kwargs): if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']: shared.model, shared.tokenizer = load_model(shared.previous_model_name) - shared.generation_lock.acquire() + needs_lock = not args[2] # use_samplers + if needs_lock: + shared.generation_lock.acquire() + try: result = _get_next_logits(*args, **kwargs) except Exception: traceback.print_exc() result = None - models.last_generation_time = time.time() - shared.generation_lock.release() + if needs_lock: + models.last_generation_time = time.time() + shared.generation_lock.release() + return result diff --git a/modules/models.py b/modules/models.py index cb32a3da..da741cb0 100644 --- a/modules/models.py +++ b/modules/models.py @@ -25,7 +25,7 @@ from transformers import ( ) import modules.shared as shared -from modules import RoPE, sampler_hijack +from modules import sampler_hijack from modules.logging_colors import logger from modules.models_settings import get_model_metadata @@ -77,6 +77,7 @@ def load_model(model_name, loader=None): 'ExLlamav2_HF': ExLlamav2_HF_loader, 'AutoAWQ': AutoAWQ_loader, 'HQQ': HQQ_loader, + 'TensorRT-LLM': TensorRT_LLM_loader, } metadata = get_model_metadata(model_name) @@ -101,7 +102,7 @@ def load_model(model_name, loader=None): tokenizer = load_tokenizer(model_name, model) shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) - if loader.lower().startswith('exllama'): + if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'): shared.settings['truncation_length'] = shared.args.max_seq_len elif loader in ['llama.cpp', 'llamacpp_HF']: shared.settings['truncation_length'] = shared.args.n_ctx @@ -248,7 +249,7 @@ def huggingface_loader(model_name): if shared.args.compress_pos_emb > 1: params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} elif shared.args.alpha_value > 1: - params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)} + params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value} logger.info("TRANSFORMERS_PARAMS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) @@ -337,6 +338,13 @@ def HQQ_loader(model_name): return model +def TensorRT_LLM_loader(model_name): + from modules.tensorrt_llm import TensorRTLLMModel + + model = TensorRTLLMModel.from_pretrained(model_name) + return model + + def get_max_memory_dict(): max_memory = {} max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' diff --git a/modules/models_settings.py b/modules/models_settings.py index c3712db2..2e3fff9c 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -16,6 +16,7 @@ def get_fallback_settings(): 'n_ctx': 2048, 'rope_freq_base': 0, 'compress_pos_emb': 1, + 'alpha_value': 1, 'truncation_length': shared.settings['truncation_length'], 'skip_special_tokens': shared.settings['skip_special_tokens'], 'custom_stopping_strings': shared.settings['custom_stopping_strings'], @@ -58,13 +59,19 @@ def get_model_metadata(model): model_settings['rope_freq_base'] = metadata[k] elif k.endswith('rope.scale_linear'): model_settings['compress_pos_emb'] = metadata[k] + elif k.endswith('rope.scaling.factor'): + model_settings['compress_pos_emb'] = metadata[k] elif k.endswith('block_count'): model_settings['n_gpu_layers'] = metadata[k] + 1 if 'tokenizer.chat_template' in metadata: template = metadata['tokenizer.chat_template'] eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']] - bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']] + if 'tokenizer.ggml.bos_token_id' in metadata: + bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']] + else: + bos_token = "" + template = template.replace('eos_token', "'{}'".format(eos_token)) template = template.replace('bos_token', "'{}'".format(bos_token)) @@ -77,6 +84,9 @@ def get_model_metadata(model): # Transformers metadata if hf_metadata is not None: metadata = json.loads(open(path, 'r', encoding='utf-8').read()) + if 'pretrained_config' in metadata: + metadata = metadata['pretrained_config'] + for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']: if k in metadata: model_settings['truncation_length'] = metadata[k] diff --git a/modules/shared.py b/modules/shared.py index 373089dc..ebbfc268 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -165,6 +165,10 @@ group.add_argument('--no_inject_fused_attention', action='store_true', help='Dis group = parser.add_argument_group('HQQ') group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') +# TensorRT-LLM +group = parser.add_argument_group('TensorRT-LLM') +group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') + # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -263,6 +267,8 @@ def fix_loader_name(name): return 'AutoAWQ' elif name in ['hqq']: return 'HQQ' + elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: + return 'TensorRT-LLM' def add_extension(name, last=False): diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py new file mode 100644 index 00000000..c2685b75 --- /dev/null +++ b/modules/tensorrt_llm.py @@ -0,0 +1,131 @@ +from pathlib import Path + +import tensorrt_llm +import torch +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp + +from modules import shared +from modules.logging_colors import logger +from modules.text_generation import ( + get_max_prompt_length, + get_reply_from_output_ids +) + + +class TensorRTLLMModel: + def __init__(self): + pass + + @classmethod + def from_pretrained(self, path_to_model): + + path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model) + runtime_rank = tensorrt_llm.mpi_rank() + + # Define model settings + runner_kwargs = dict( + engine_dir=str(path_to_model), + lora_dir=None, + rank=runtime_rank, + debug_mode=False, + lora_ckpt_source="hf", + ) + + if shared.args.cpp_runner: + logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"") + runner_kwargs.update( + max_batch_size=1, + max_input_len=shared.args.max_seq_len - 512, + max_output_len=512, + max_beam_width=1, + max_attention_window_size=None, + sink_token_length=None, + ) + else: + logger.info("TensorRT-LLM: Using \"ModelRunner\"") + + # Load the model + runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner + runner = runner_cls.from_dir(**runner_kwargs) + + result = self() + result.model = runner + result.runtime_rank = runtime_rank + + return result + + def generate_with_streaming(self, prompt, state): + batch_input_ids = [] + input_ids = shared.tokenizer.encode( + prompt, + add_special_tokens=True, + truncation=False, + ) + input_ids = torch.tensor(input_ids, dtype=torch.int32) + input_ids = input_ids[-get_max_prompt_length(state):] # Apply truncation_length + batch_input_ids.append(input_ids) + + if shared.args.cpp_runner: + max_new_tokens = min(512, state['max_new_tokens']) + elif state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - input_ids.shape[-1] + else: + max_new_tokens = state['max_new_tokens'] + + with torch.no_grad(): + generator = self.model.generate( + batch_input_ids, + max_new_tokens=max_new_tokens, + max_attention_window_size=None, + sink_token_length=None, + end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1, + pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id, + temperature=state['temperature'], + top_k=state['top_k'], + top_p=state['top_p'], + num_beams=1, + length_penalty=1.0, + repetition_penalty=state['repetition_penalty'], + presence_penalty=state['presence_penalty'], + frequency_penalty=state['frequency_penalty'], + stop_words_list=None, + bad_words_list=None, + lora_uids=None, + prompt_table_path=None, + prompt_tasks=None, + streaming=not shared.args.cpp_runner, + output_sequence_lengths=True, + return_dict=True, + medusa_choices=None + ) + + torch.cuda.synchronize() + + cumulative_reply = '' + starting_from = batch_input_ids[0].shape[-1] + + if shared.args.cpp_runner: + sequence_length = generator['sequence_lengths'][0].item() + output_ids = generator['output_ids'][0][0][:sequence_length].tolist() + + cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from) + starting_from = sequence_length + yield cumulative_reply + else: + for curr_outputs in generator: + if shared.stop_everything: + break + + sequence_length = curr_outputs['sequence_lengths'][0].item() + output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist() + + cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from) + starting_from = sequence_length + yield cumulative_reply + + def generate(self, prompt, state): + output = '' + for output in self.generate_with_streaming(prompt, state): + pass + + return output diff --git a/modules/text_generation.py b/modules/text_generation.py index ca42ba1f..d971a30e 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -54,7 +54,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']: generate_func = generate_reply_custom else: generate_func = generate_reply_HF @@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.tokenizer is None: raise ValueError('No tokenizer is loaded') - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']: input_ids = shared.tokenizer.encode(str(prompt)) if shared.model.__class__.__name__ not in ['Exllamav2Model']: input_ids = np.array(input_ids).reshape(1, len(input_ids)) @@ -158,7 +158,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: import deepspeed diff --git a/modules/ui.py b/modules/ui.py index f88c0a82..c20a7888 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -106,6 +106,7 @@ def list_model_elements(): 'streaming_llm', 'attention_sink_size', 'hqq_backend', + 'cpp_runner', ] if is_torch_xpu_available(): for i in range(torch.xpu.device_count()): diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 3193bd67..91951624 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -19,7 +19,7 @@ def create_ui(): mu = shared.args.multi_user shared.gradio['Chat input'] = gr.State() - shared.gradio['history'] = gr.State({'internal': [], 'visible': []}) + shared.gradio['history'] = gr.JSON({'internal': [], 'visible': []}, visible=False) with gr.Tab('Chat', elem_id='chat-tab', elem_classes=("old-ui" if shared.args.chat_buttons else None)): with gr.Row(): @@ -62,9 +62,6 @@ def create_ui(): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): with gr.Column(): - with gr.Row(): - shared.gradio['unique_id'] = gr.Dropdown(label='Past chats', elem_classes=['slim-dropdown'], interactive=not mu) - with gr.Row(): shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) @@ -74,8 +71,13 @@ def create_ui(): with gr.Row(elem_id='rename-row'): shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background']) - shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) + with gr.Row(): + shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) + shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) + + gr.Markdown("Past chats") + with gr.Row(): + shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats') with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']): with gr.Column(): @@ -83,7 +85,12 @@ def create_ui(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value='chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode') + shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode') + + with gr.Row(): + shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', elem_classes='slim-dropdown') + shared.gradio['refresh_character'] = ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu) + shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) with gr.Row(): shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') @@ -98,15 +105,10 @@ def create_chat_settings_ui(): with gr.Row(): with gr.Column(scale=8): with gr.Tab("Character"): - with gr.Row(): - shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown') - ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu) - shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu) - shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) - shared.gradio['name2'] = gr.Textbox(value='', lines=1, label='Character\'s name') shared.gradio['context'] = gr.Textbox(value='', lines=10, label='Context', elem_classes=['add_scrollbar']) shared.gradio['greeting'] = gr.Textbox(value='', lines=5, label='Greeting', elem_classes=['add_scrollbar']) + shared.gradio['save_character'] = gr.Button('Save character', elem_classes=['small-button'], interactive=not mu) with gr.Tab("User"): shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Name') @@ -181,7 +183,7 @@ def create_event_handlers(): chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( @@ -189,28 +191,28 @@ def create_event_handlers(): chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Regenerate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Continue'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Impersonate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( chat.impersonate_wrapper, gradio(inputs), gradio('textbox', 'display'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Replace last reply'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( @@ -252,7 +254,7 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.start_new_chat, gradio('interface_state'), gradio('history')).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')) + lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False) shared.gradio['delete_chat'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr)) shared.gradio['delete_chat-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)) @@ -260,12 +262,12 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x, y: str(chat.find_all_histories(x).index(y)), gradio('interface_state', 'unique_id'), gradio('temporary_text')).then( chat.delete_history, gradio('unique_id', 'character_menu', 'mode'), None).then( - chat.load_history_after_deletion, gradio('interface_state', 'temporary_text'), gradio('history', 'unique_id')).then( + chat.load_history_after_deletion, gradio('interface_state', 'temporary_text'), gradio('history', 'unique_id'), show_progress=False).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)) shared.gradio['rename_chat'].click( - lambda x: x, gradio('unique_id'), gradio('rename_to')).then( + lambda: "My New Chat", None, gradio('rename_to')).then( lambda: [gr.update(visible=True)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) shared.gradio['rename_to-cancel'].click( @@ -274,36 +276,36 @@ def create_event_handlers(): shared.gradio['rename_to-confirm'].click( chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then( lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then( - lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id')) + lambda x, y: gr.update(choices=chat.find_all_histories_with_first_prompts(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id')) shared.gradio['rename_to'].submit( chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then( lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then( - lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id')) + lambda x, y: gr.update(choices=chat.find_all_histories_with_first_prompts(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id')) shared.gradio['load_chat_history'].upload( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.start_new_chat, gradio('interface_state'), gradio('history')).then( chat.load_history_json, gradio('load_chat_history', 'history'), gradio('history')).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then( + lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False).then( chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}') + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}') shared.gradio['character_menu'].change( chat.load_character, gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context')).success( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.load_latest_history, gradio('interface_state'), gradio('history')).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then( - lambda: None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}') + lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}') shared.gradio['mode'].change( - lambda x: [gr.update(visible=x != 'instruct'), gr.update(visible=x == 'chat-instruct')], gradio('mode'), gradio('chat_style', 'chat-instruct_command'), show_progress=False).then( + lambda x: [gr.update(visible=(x != 'instruct'))] * 4 + [gr.update(visible=(x == 'chat-instruct'))], gradio('mode'), gradio('character_menu', 'refresh_character', 'delete_character', 'chat_style', 'chat-instruct_command'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.load_latest_history, gradio('interface_state'), gradio('history')).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')) + lambda x: gr.update(choices=(histories := chat.find_all_histories_with_first_prompts(x)), value=histories[0][1]), gradio('interface_state'), gradio('unique_id'), show_progress=False) shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display')) shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) @@ -336,11 +338,11 @@ def create_event_handlers(): shared.gradio['Submit character'].click( chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') shared.gradio['Submit tavern character'].click( chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) @@ -354,28 +356,28 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then( partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-default')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') shared.gradio['send_instruction_to_notebook'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then( partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-notebook')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') shared.gradio['send_instruction_to_negative_prompt'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then( partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('negative_prompt')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}') + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}') shared.gradio['send-chat-to-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-default')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') shared.gradio['send-chat-to-notebook'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-notebook')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') - shared.gradio['show_controls'].change(lambda x: None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') + shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') diff --git a/modules/ui_default.py b/modules/ui_default.py index 1f962551..bf9800f6 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -67,21 +67,21 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox-default'].submit( lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False) shared.gradio['Continue-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False) shared.gradio['prompt_menu-default'].change(load_prompt, gradio('prompt_menu-default'), gradio('textbox-default'), show_progress=False) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d8b53b11..3ebcd126 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -105,8 +105,8 @@ def create_ui(): shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len) with gr.Blocks(): shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) - shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base) - shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb) + shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=20000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base) + shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=0.1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb) shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') @@ -139,6 +139,7 @@ def create_ui(): shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') + shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') with gr.Blocks(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) @@ -147,9 +148,9 @@ def create_ui(): shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') - shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") + shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') with gr.Column(): with gr.Row(): diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index a7c62baf..307bc0f3 100644 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -67,14 +67,14 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox-notebook'].submit( lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Undo'].click(lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False) shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), gradio('markdown-notebook'), queue=False) @@ -83,7 +83,7 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False) shared.gradio['prompt_menu-notebook'].change(load_prompt, gradio('prompt_menu-notebook'), gradio('textbox-notebook'), show_progress=False) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index d62b74c1..68512c7e 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -40,7 +40,6 @@ def create_ui(default_preset): shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') with gr.Blocks(): - gr.Markdown("[DRY sequence repetition penalty](https://github.com/oobabooga/text-generation-webui/pull/5677)") shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=generate_params['dry_multiplier'], step=0.01, label='dry_multiplier', info='Set to value > 0 to enable DRY. Controls the magnitude of the penalty for the shortest penalized sequences.') shared.gradio['dry_base'] = gr.Slider(1, 4, value=generate_params['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.') shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=generate_params['dry_allowed_length'], step=1, label='dry_allowed_length', info='Longest sequence that can be repeated without being penalized.') diff --git a/modules/ui_session.py b/modules/ui_session.py index 08929c33..087091ce 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -32,10 +32,10 @@ def create_ui(): # Reset interface event shared.gradio['reset_interface'].click( set_interface_arguments, gradio('extensions_menu', 'bool_menu'), None).then( - lambda: None, None, None, js='() => {document.body.innerHTML=\'