From ae02ffc605d90b19a507132c09055c7ff11df52b Mon Sep 17 00:00:00 2001 From: oobabooga Date: Sun, 20 Apr 2025 13:33:47 -0300 Subject: [PATCH 01/21] Refactor the transformers loader (#6859) --- extensions/openai/completions.py | 86 +++------- extensions/openai/script.py | 6 +- modules/LoRA.py | 4 +- modules/callbacks.py | 22 --- modules/evaluate.py | 7 +- modules/loaders.py | 18 +- modules/logits.py | 23 ++- modules/models.py | 283 ++----------------------------- modules/models_settings.py | 17 -- modules/sampler_hijack.py | 2 +- modules/shared.py | 4 +- modules/text_generation.py | 129 ++++++++------ modules/torch_utils.py | 37 ++++ modules/training.py | 17 +- modules/transformers_loader.py | 281 ++++++++++++++++++++++++++++++ modules/ui.py | 10 -- modules/ui_model_menu.py | 40 +---- server.py | 6 +- 18 files changed, 464 insertions(+), 528 deletions(-) create mode 100644 modules/torch_utils.py create mode 100644 modules/transformers_loader.py diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index f1a60645..75e2cc11 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -7,10 +7,7 @@ from io import BytesIO import requests import tiktoken -import torch -import torch.nn.functional as F from PIL import Image -from transformers import LogitsProcessor, LogitsProcessorList from extensions.openai.errors import InvalidRequestError from extensions.openai.utils import debug_msg @@ -22,54 +19,7 @@ from modules.chat import ( load_instruction_template_memoized ) from modules.presets import load_preset_memoized -from modules.text_generation import ( - decode, - encode, - generate_reply, - get_reply_from_output_ids -) - - -class LogitsBiasProcessor(LogitsProcessor): - def __init__(self, logit_bias={}): - self.logit_bias = logit_bias - if self.logit_bias: - self.keys = list([int(key) for key in self.logit_bias.keys()]) - values = [self.logit_bias[str(key)] for key in self.keys] - self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device) - debug_msg(f"{self})") - - def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: - if self.logit_bias: - debug_msg(logits[0, self.keys], " + ", self.values) - logits[0, self.keys] += self.values - debug_msg(" --> ", logits[0, self.keys]) - debug_msg(" max/min ", float(torch.max(logits[0])), float(torch.min(logits[0]))) - - return logits - - def __repr__(self): - return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>" - - -class LogprobProcessor(LogitsProcessor): - def __init__(self, logprobs=None): - self.logprobs = logprobs - self.token_alternatives = {} - - def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: - if self.logprobs is not None: # 0-5 - log_e_probabilities = F.log_softmax(logits, dim=1) - top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1) - top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]] - top_probs = [float(x) for x in top_values[0]] - self.token_alternatives = dict(zip(top_tokens, top_probs)) - debug_msg(repr(self)) - - return logits - - def __repr__(self): - return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>" +from modules.text_generation import decode, encode, generate_reply def convert_logprobs_to_tiktoken(model, logprobs): @@ -107,21 +57,29 @@ def process_parameters(body, is_legacy=False): elif isinstance(body['stop'], list): generate_params['custom_stopping_strings'] = body['stop'] - logits_processor = [] - logit_bias = body.get('logit_bias', None) - if logit_bias: # {str: float, ...} - logits_processor = [LogitsBiasProcessor(logit_bias)] + if shared.args.loader != 'llama.cpp': + from transformers import LogitsProcessorList - logprobs = None # coming to chat eventually - if 'logprobs' in body: - logprobs = body.get('logprobs', 0) # maybe cap at topk? don't clamp 0-5. - generate_params['logprob_proc'] = LogprobProcessor(logprobs) - logits_processor.extend([generate_params['logprob_proc']]) - else: - logprobs = None + from modules.transformers_loader import ( + LogitsBiasProcessor, + LogprobProcessor + ) - if logits_processor: # requires logits_processor support - generate_params['logits_processor'] = LogitsProcessorList(logits_processor) + logits_processor = [] + logit_bias = body.get('logit_bias', None) + if logit_bias: # {str: float, ...} + logits_processor = [LogitsBiasProcessor(logit_bias)] + + logprobs = None # coming to chat eventually + if 'logprobs' in body: + logprobs = body.get('logprobs', 0) # maybe cap at topk? don't clamp 0-5. + generate_params['logprob_proc'] = LogprobProcessor(logprobs) + logits_processor.extend([generate_params['logprob_proc']]) + else: + logprobs = None + + if logits_processor: # requires logits_processor support + generate_params['logits_processor'] = LogitsProcessorList(logits_processor) return generate_params diff --git a/extensions/openai/script.py b/extensions/openai/script.py index f23caf9b..2ce692c7 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -16,11 +16,9 @@ from pydub import AudioSegment from sse_starlette import EventSourceResponse import extensions.openai.completions as OAIcompletions -import extensions.openai.embeddings as OAIembeddings import extensions.openai.images as OAIimages import extensions.openai.logits as OAIlogits import extensions.openai.models as OAImodels -import extensions.openai.moderations as OAImoderations from extensions.openai.errors import ServiceUnavailableError from extensions.openai.tokens import token_count, token_decode, token_encode from extensions.openai.utils import _start_cloudflared @@ -211,6 +209,8 @@ async def handle_image_generation(request: Request): @app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key) async def handle_embeddings(request: Request, request_data: EmbeddingsRequest): + import extensions.openai.embeddings as OAIembeddings + input = request_data.input if not input: raise HTTPException(status_code=400, detail="Missing required argument input") @@ -224,6 +224,8 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest): @app.post("/v1/moderations", dependencies=check_key) async def handle_moderations(request: Request): + import extensions.openai.moderations as OAImoderations + body = await request.json() input = body["input"] if not input: diff --git a/modules/LoRA.py b/modules/LoRA.py index 1f4883e2..a9e9a895 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -2,7 +2,6 @@ from pathlib import Path import modules.shared as shared from modules.logging_colors import logger -from modules.models import get_device def add_lora_to_model(lora_names): @@ -47,9 +46,10 @@ def add_lora_exllamav2(lora_names): def add_lora_transformers(lora_names): - from peft import PeftModel + from modules.torch_utils import get_device + prior_set = set(shared.lora_names) added_set = set(lora_names) - prior_set removed_set = prior_set - set(lora_names) diff --git a/modules/callbacks.py b/modules/callbacks.py index 0f918f3d..082365f0 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -2,9 +2,6 @@ import traceback from queue import Queue from threading import Thread -import torch -import transformers - import modules.shared as shared @@ -12,25 +9,6 @@ class StopNowException(Exception): pass -class _StopEverythingStoppingCriteria(transformers.StoppingCriteria): - def __init__(self): - transformers.StoppingCriteria.__init__(self) - - def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool: - return shared.stop_everything - - -class Stream(transformers.StoppingCriteria): - def __init__(self, callback_func=None): - self.callback_func = callback_func - - def __call__(self, input_ids, scores) -> bool: - if self.callback_func is not None: - self.callback_func(input_ids[0]) - - return False - - class Iteratorize: """ diff --git a/modules/evaluate.py b/modules/evaluate.py index dd7ef9a7..bbf27da1 100644 --- a/modules/evaluate.py +++ b/modules/evaluate.py @@ -2,13 +2,12 @@ import datetime from pathlib import Path import pandas as pd -import torch from datasets import load_dataset from tqdm import tqdm from modules import shared from modules.logging_colors import logger -from modules.models import clear_torch_cache, load_model, unload_model +from modules.models import load_model, unload_model from modules.models_settings import get_model_metadata, update_model_parameters from modules.text_generation import encode @@ -39,6 +38,10 @@ def calculate_perplexity(models, input_dataset, stride, _max_length): https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models ''' + import torch + + from modules.torch_utils import clear_torch_cache + if shared.args.loader == "llama.cpp": logger.error("Perplexity evaluation is not implemented for the llama.cpp loader.") raise ValueError diff --git a/modules/loaders.py b/modules/loaders.py index aaf4c8ed..f69f1720 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -3,11 +3,9 @@ from collections import OrderedDict import gradio as gr -from modules import shared - loaders_and_params = OrderedDict({ 'Transformers': [ - 'gpu_memory', + 'gpu_split', 'cpu_memory', 'alpha_value', 'compress_pos_emb', @@ -17,7 +15,6 @@ loaders_and_params = OrderedDict({ 'load_in_4bit', 'torch_compile', 'use_flash_attention_2', - 'auto_devices', 'cpu', 'disk', 'use_double_quant', @@ -346,10 +343,6 @@ def blacklist_samplers(loader, dynamic_temperature): return output -def get_gpu_memory_keys(): - return [k for k in shared.gradio if k.startswith('gpu_memory')] - - @functools.cache def get_all_params(): all_params = set() @@ -357,11 +350,6 @@ def get_all_params(): for el in loaders_and_params[k]: all_params.add(el) - if 'gpu_memory' in all_params: - all_params.remove('gpu_memory') - for k in get_gpu_memory_keys(): - all_params.add(k) - return sorted(all_params) @@ -371,8 +359,4 @@ def make_loader_params_visible(loader): if loader in loaders_and_params: params = loaders_and_params[loader] - if 'gpu_memory' in params: - params.remove('gpu_memory') - params += get_gpu_memory_keys() - return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params] diff --git a/modules/logits.py b/modules/logits.py index 020e7424..9a4243ff 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -2,11 +2,10 @@ import time import traceback import numpy as np -import torch -from modules import models, sampler_hijack, shared +from modules import models, shared from modules.logging_colors import logger -from modules.models import get_device, load_model +from modules.models import load_model from modules.text_generation import generate_reply global_scores = None @@ -38,18 +37,16 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur logger.error("No model is loaded! Select one in the Model tab.") return 'Error: No model is loaded1 Select one in the Model tab.', previous - is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model' - is_llamacpp = shared.model.__class__.__name__ == 'LlamaServer' - - if is_llamacpp: + # llama.cpp case + if shared.model.__class__.__name__ == 'LlamaServer': logprobs = shared.model.get_logits(prompt, state, n_probs=top_logits, use_samplers=use_samplers) + if return_dict: output = {} for entry in logprobs: token = repr(entry['token']) prob = entry['prob'] if use_samplers else np.exp(entry['logprob']) output[token] = prob - return output else: output = '' @@ -57,9 +54,17 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur token = repr(entry['token']) prob = entry['prob'] if use_samplers else np.exp(entry['logprob']) output += f"{prob:.5f} - {token}\n" - return output, previous + + # All other model types else: + import torch + + from modules import sampler_hijack + from modules.torch_utils import get_device + + is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model' + if not use_samplers: state = {'stream': True} diff --git a/modules/models.py b/modules/models.py index 48d92bd5..c4dfa149 100644 --- a/modules/models.py +++ b/modules/models.py @@ -1,61 +1,10 @@ -import gc -import os -import pprint -import re import time from pathlib import Path -import torch -import transformers -from accelerate import infer_auto_device_map, init_empty_weights -from accelerate.utils import ( - is_ccl_available, - is_npu_available, - is_xpu_available -) -from transformers import ( - AutoConfig, - AutoModel, - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - BitsAndBytesConfig, - is_torch_npu_available, - is_torch_xpu_available -) - import modules.shared as shared from modules.logging_colors import logger from modules.models_settings import get_model_metadata -transformers.logging.set_verbosity_error() - -local_rank = None -if shared.args.deepspeed: - import deepspeed - from transformers.integrations.deepspeed import ( - HfDeepSpeedConfig, - is_deepspeed_zero3_enabled - ) - - from modules.deepspeed_parameters import generate_ds_config - - # Distributed setup - local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0")) - world_size = int(os.getenv("WORLD_SIZE", "1")) - if is_xpu_available() and is_ccl_available(): - torch.xpu.set_device(local_rank) - deepspeed.init_distributed(backend="ccl") - elif is_npu_available(): - torch.npu.set_device(local_rank) - deepspeed.init_distributed(dist_backend="hccl") - else: - torch.cuda.set_device(local_rank) - deepspeed.init_distributed() - ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) - dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration - - last_generation_time = time.time() @@ -66,8 +15,8 @@ def load_model(model_name, loader=None): shared.is_seq2seq = False shared.model_name = model_name load_func_map = { - 'Transformers': huggingface_loader, 'llama.cpp': llama_cpp_server_loader, + 'Transformers': transformers_loader, 'ExLlamav3_HF': ExLlamav3_HF_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ExLlamav2': ExLlamav2_loader, @@ -86,7 +35,6 @@ def load_model(model_name, loader=None): raise ValueError shared.args.loader = loader - clear_torch_cache() output = load_func_map[loader](model_name) if type(output) is tuple: model, tokenizer = output @@ -95,6 +43,7 @@ def load_model(model_name, loader=None): if model is None: return None, None else: + from modules.transformers_loader import load_tokenizer tokenizer = load_tokenizer(model_name) shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) @@ -110,163 +59,6 @@ def load_model(model_name, loader=None): return model, tokenizer -def load_tokenizer(model_name, tokenizer_dir=None): - if tokenizer_dir: - path_to_model = Path(tokenizer_dir) - else: - path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") - - tokenizer = None - if path_to_model.exists(): - if shared.args.no_use_fast: - logger.info('Loading the tokenizer with use_fast=False.') - - tokenizer = AutoTokenizer.from_pretrained( - path_to_model, - trust_remote_code=shared.args.trust_remote_code, - use_fast=not shared.args.no_use_fast - ) - - return tokenizer - - -def huggingface_loader(model_name): - path_to_model = Path(f'{shared.args.model_dir}/{model_name}') - params = { - 'low_cpu_mem_usage': True, - 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, - } - - if shared.args.trust_remote_code: - params['trust_remote_code'] = True - - if shared.args.use_flash_attention_2: - params['use_flash_attention_2'] = True - - if shared.args.force_safetensors: - params['force_safetensors'] = True - - if shared.args.use_eager_attention: - params['attn_implementation'] = 'eager' - - config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) - - if 'chatglm' in model_name.lower(): - LoaderClass = AutoModel - else: - if config.to_dict().get('is_encoder_decoder', False): - LoaderClass = AutoModelForSeq2SeqLM - shared.is_seq2seq = True - else: - LoaderClass = AutoModelForCausalLM - - # Determine if we should use default loading - should_use_default_loading = not any([ - shared.args.cpu, - shared.args.load_in_8bit, - shared.args.load_in_4bit, - shared.args.auto_devices, - shared.args.disk, - shared.args.deepspeed, - shared.args.gpu_memory is not None, - shared.args.cpu_memory is not None, - shared.args.compress_pos_emb > 1, - shared.args.alpha_value > 1, - ]) - - # Load the model without any special settings - if should_use_default_loading: - logger.info("TRANSFORMERS_PARAMS=") - pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) - print() - - model = LoaderClass.from_pretrained(path_to_model, **params) - if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit): - device = get_device() - if device: - model = model.to(device) - - # DeepSpeed ZeRO-3 - elif shared.args.deepspeed: - model = LoaderClass.from_pretrained( - path_to_model, - torch_dtype=params['torch_dtype'], - trust_remote_code=params.get('trust_remote_code') - ) - - model = deepspeed.initialize( - model=model, - config_params=ds_config, - model_parameters=None, - optimizer=None, - lr_scheduler=None - )[0] - - model.module.eval() # Inference - logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}') - - # Load with quantization and/or offloading - else: - if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())): - logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.') - shared.args.cpu = True - - if shared.args.cpu: - params['torch_dtype'] = torch.float32 - else: - params['device_map'] = 'auto' - if x := get_max_memory_dict(): - params['max_memory'] = x - - if shared.args.load_in_4bit: - # See https://github.com/huggingface/transformers/pull/23479/files - # and https://huggingface.co/blog/4bit-transformers-bitsandbytes - quantization_config_params = { - 'load_in_4bit': True, - 'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None, - 'bnb_4bit_quant_type': shared.args.quant_type, - 'bnb_4bit_use_double_quant': shared.args.use_double_quant, - 'llm_int8_enable_fp32_cpu_offload': True - } - params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params) - - elif shared.args.load_in_8bit: - if shared.args.auto_devices or shared.args.gpu_memory: - params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True) - else: - params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True) - - if params.get('max_memory') is not None: - with init_empty_weights(): - model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code')) - - model.tie_weights() - params['device_map'] = infer_auto_device_map( - model, - dtype=torch.int8, - max_memory=params.get('max_memory'), - no_split_module_classes=model._no_split_modules - ) - - if shared.args.disk: - params['offload_folder'] = shared.args.disk_cache_dir - - if shared.args.compress_pos_emb > 1: - params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} - elif shared.args.alpha_value > 1: - params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value} - - logger.info("TRANSFORMERS_PARAMS=") - pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) - print() - model = LoaderClass.from_pretrained(path_to_model, **params) - - if shared.args.torch_compile: - model = torch.compile(model) - - return model - - def llama_cpp_server_loader(model_name): from modules.llama_cpp_server import LlamaServer @@ -284,6 +76,11 @@ def llama_cpp_server_loader(model_name): logger.error(f"Error loading the model with llama.cpp: {str(e)}") +def transformers_loader(model_name): + from modules.transformers_loader import load_model_HF + return load_model_HF(model_name) + + def ExLlamav3_HF_loader(model_name): from modules.exllamav3_hf import Exllamav3HF @@ -328,71 +125,15 @@ def TensorRT_LLM_loader(model_name): return model -def get_max_memory_dict(): - max_memory = {} - max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' - if shared.args.gpu_memory: - memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory)) - for i in range(len(memory_map)): - max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i] - - max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory - - # If --auto-devices is provided standalone, try to get a reasonable value - # for the maximum memory of device :0 - elif shared.args.auto_devices: - if is_xpu_available(): - total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024)) - else: - total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024)) - - suggestion = round((total_mem - 1000) / 1000) * 1000 - if total_mem - suggestion < 800: - suggestion -= 1000 - - suggestion = int(round(suggestion / 1000)) - logger.warning(f"Auto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors. You can manually set other values.") - max_memory[0] = f'{suggestion}GiB' - max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory - - return max_memory if len(max_memory) > 0 else None - - -def get_device(): - if torch.cuda.is_available(): - return torch.device('cuda') - elif shared.args.deepspeed: - import deepspeed - return deepspeed.get_accelerator().current_device_name() - elif torch.backends.mps.is_available(): - return torch.device('mps') - elif is_torch_xpu_available(): - return torch.device('xpu:0') - elif is_torch_npu_available(): - return torch.device('npu:0') - else: - return None - - -def clear_torch_cache(): - gc.collect() - if not shared.args.cpu: - if torch.cuda.is_available(): - torch.cuda.empty_cache() - elif is_xpu_available(): - torch.xpu.empty_cache() - elif is_npu_available(): - torch.npu.empty_cache() - elif torch.backends.mps.is_available(): - if hasattr(torch.backends.mps, 'empty_cache'): - torch.backends.mps.empty_cache() - - def unload_model(keep_model_name=False): + is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') + shared.model = shared.tokenizer = None shared.lora_names = [] shared.model_dirty_from_training = False - clear_torch_cache() + if not is_llamacpp: + from modules.torch_utils import clear_torch_cache + clear_torch_cache() if not keep_model_name: shared.model_name = 'None' diff --git a/modules/models_settings.py b/modules/models_settings.py index 0af89d2c..f5f3be7a 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -188,17 +188,12 @@ def update_model_parameters(state, initial=False): UI: update the command-line arguments based on the interface values ''' elements = ui.list_model_elements() # the names of the parameters - gpu_memories = [] for i, element in enumerate(elements): if element not in state: continue value = state[element] - if element.startswith('gpu_memory'): - gpu_memories.append(value) - continue - if initial and element in shared.provided_arguments: continue @@ -211,18 +206,6 @@ def update_model_parameters(state, initial=False): setattr(shared.args, element, value) - found_positive = False - for i in gpu_memories: - if i > 0: - found_positive = True - break - - if not (initial and vars(shared.args)['gpu_memory'] != vars(shared.args_defaults)['gpu_memory']): - if found_positive: - shared.args.gpu_memory = [f"{i}MiB" for i in gpu_memories] - else: - shared.args.gpu_memory = None - def apply_model_settings_to_state(model, state): ''' diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index e6883289..ee871a6e 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -13,7 +13,7 @@ from transformers.generation.logits_process import ( from modules import shared from modules.logging_colors import logger -from modules.models import get_device +from modules.torch_utils import get_device global_scores = None diff --git a/modules/shared.py b/modules/shared.py index 641b3f48..fd0f226e 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -91,9 +91,7 @@ group.add_argument('--loader', type=str, help='Choose the model loader manually, # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.') -group.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') -group.add_argument('--gpu-memory', type=str, nargs='+', help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.') -group.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.') +group.add_argument('--cpu-memory', type=float, default=0, help='Maximum CPU memory in GiB. Use this for CPU offloading.') group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".') group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).') diff --git a/modules/text_generation.py b/modules/text_generation.py index 16aba3cb..585e4d9d 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -7,33 +7,18 @@ import time import traceback import numpy as np -import torch -import transformers -from transformers import ( - LogitsProcessorList, - is_torch_npu_available, - is_torch_xpu_available -) import modules.shared as shared -from modules import models, sampler_hijack -from modules.callbacks import ( - Iteratorize, - Stream, - _StopEverythingStoppingCriteria -) +from modules import models +from modules.callbacks import Iteratorize from modules.extensions import apply_extensions -from modules.grammar.grammar_utils import initialize_grammar -from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor from modules.html_generator import generate_basic_html from modules.logging_colors import logger -from modules.models import clear_torch_cache, get_device, load_model - -sampler_hijack.hijack_samplers() def generate_reply(*args, **kwargs): if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: + from modules.models import load_model shared.model, shared.tokenizer = load_model(shared.model_name) shared.generation_lock.acquire() @@ -46,7 +31,6 @@ def generate_reply(*args, **kwargs): def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False, for_ui=False): - # Find the appropriate generation function generate_func = apply_extensions('custom_generate_reply') if generate_func is None: @@ -80,7 +64,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap all_stop_strings += st shared.stop_everything = False - seed = set_manual_seed(state['seed']) last_update = -1 reply = '' is_stream = state['stream'] @@ -93,7 +76,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap min_update_interval = 1 / state['max_updates_second'] # Generate - for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat): + for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat): reply, stop_found = apply_stopping_strings(reply, all_stop_strings) if escape_html: reply = html.escape(reply) @@ -132,44 +115,55 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.tokenizer is None: raise ValueError('No tokenizer is loaded') - if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']: - if shared.model.__class__.__name__ == 'LlamaServer': - input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token) - else: + # llama.cpp case + if shared.model.__class__.__name__ == 'LlamaServer': + input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token) + input_ids = np.array(input_ids).reshape(1, len(input_ids)) + + if truncation_length is not None: + input_ids = input_ids[:, -truncation_length:] + + return input_ids + + # All other model types + else: + import torch + + from modules.torch_utils import get_device + + if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel']: input_ids = shared.tokenizer.encode(str(prompt)) + if shared.model.__class__.__name__ != 'Exllamav2Model': + input_ids = np.array(input_ids).reshape(1, len(input_ids)) + else: + input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) - if shared.model.__class__.__name__ not in ['Exllamav2Model']: - input_ids = np.array(input_ids).reshape(1, len(input_ids)) - else: - input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) + if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None: + if add_bos_token: + # Add BOS token if missing + if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0: + bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]]) + input_ids = torch.cat((bos_tensor, input_ids), 1) - if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None: - if add_bos_token: - if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0: - # Add a missing bos token (it may not have been added due to faulty model metadata) - bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]]) - input_ids = torch.cat((bos_tensor, input_ids), 1) + # Prevent double BOS tokens from jinja templates + while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id: + input_ids = input_ids[:, 1:] + else: + # Remove BOS tokens when not wanted + while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id: + input_ids = input_ids[:, 1:] - # Prevent double bos token due to jinja templates with somewhere - while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id: - input_ids = input_ids[:, 1:] - else: - # Remove any bos token that may have been added - while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id: - input_ids = input_ids[:, 1:] + if truncation_length is not None: + input_ids = input_ids[:, -truncation_length:] - # Handling truncation - if truncation_length is not None: - input_ids = input_ids[:, -truncation_length:] + if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: + return input_ids + else: + device = get_device() + if device: + return input_ids.to(device) - if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: - return input_ids - else: - device = get_device() - if device: - return input_ids.to(device) - - return input_ids + return input_ids def decode(output_ids, skip_special_tokens=True): @@ -221,6 +215,9 @@ def formatted_outputs(reply, model_name): def set_manual_seed(seed): + import torch + from transformers import is_torch_npu_available, is_torch_xpu_available + seed = int(seed) if seed == -1: seed = random.randint(1, 2**31) @@ -285,10 +282,26 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0): return reply -def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): +def generate_reply_HF(question, original_question, state, stopping_strings=None, is_chat=False): + import torch + import transformers + from transformers import LogitsProcessorList + + from modules.grammar.grammar_utils import initialize_grammar + from modules.grammar.logits_process import ( + GrammarConstrainedLogitsProcessor + ) + from modules.torch_utils import clear_torch_cache, get_device + from modules.transformers_loader import ( + Stream, + _StopEverythingStoppingCriteria + ) + if shared.args.loader == 'Transformers': clear_torch_cache() + seed = set_manual_seed(state['seed']) + generate_params = {} for k in [ 'temperature', @@ -458,11 +471,15 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings return -def generate_reply_custom(question, original_question, seed, state, stopping_strings=None, is_chat=False): +def generate_reply_custom(question, original_question, state, stopping_strings=None, is_chat=False): """ For models that do not use the transformers library for sampling """ - seed = set_manual_seed(state['seed']) + + seed = state['seed'] + if shared.args.loader != 'llama.cpp': + print(shared.args.loader) + seed = set_manual_seed(seed) t0 = time.time() reply = '' diff --git a/modules/torch_utils.py b/modules/torch_utils.py new file mode 100644 index 00000000..ad9b26ad --- /dev/null +++ b/modules/torch_utils.py @@ -0,0 +1,37 @@ +import gc + +import torch +from accelerate.utils import is_npu_available, is_xpu_available +from transformers import is_torch_npu_available, is_torch_xpu_available + +from modules import shared + + +def get_device(): + if torch.cuda.is_available(): + return torch.device('cuda') + elif shared.args.deepspeed: + import deepspeed + return deepspeed.get_accelerator().current_device_name() + elif torch.backends.mps.is_available(): + return torch.device('mps') + elif is_torch_xpu_available(): + return torch.device('xpu:0') + elif is_torch_npu_available(): + return torch.device('npu:0') + else: + return None + + +def clear_torch_cache(): + gc.collect() + if not shared.args.cpu: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif is_xpu_available(): + torch.xpu.empty_cache() + elif is_npu_available(): + torch.npu.empty_cache() + elif torch.backends.mps.is_available(): + if hasattr(torch.backends.mps, 'empty_cache'): + torch.backends.mps.empty_cache() diff --git a/modules/training.py b/modules/training.py index 11c4b8c5..c6c380a3 100644 --- a/modules/training.py +++ b/modules/training.py @@ -15,13 +15,6 @@ from datetime import datetime from pathlib import Path import gradio as gr -import torch -import transformers -from datasets import Dataset, load_dataset -from transformers import is_torch_xpu_available -from transformers.models.auto.modeling_auto import ( - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES -) from modules import shared, ui, utils from modules.evaluate import ( @@ -33,7 +26,6 @@ from modules.logging_colors import logger from modules.models import reload_model from modules.utils import natural_keys -MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()} PARAMETERS = ["lora_name", "always_override", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"] WANT_INTERRUPT = False @@ -284,6 +276,9 @@ def calc_trainable_parameters(model): def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str): + import torch + import transformers + from datasets import Dataset, load_dataset from peft import ( LoraConfig, get_peft_model, @@ -293,6 +288,12 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: from peft.utils.other import \ TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \ model_to_lora_modules + from transformers import is_torch_xpu_available + from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + ) + + MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()} global WANT_INTERRUPT WANT_INTERRUPT = False diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py new file mode 100644 index 00000000..5512f061 --- /dev/null +++ b/modules/transformers_loader.py @@ -0,0 +1,281 @@ +import os +import pprint +from pathlib import Path + +import torch +import torch.nn.functional as F +import transformers +from accelerate import infer_auto_device_map, init_empty_weights +from accelerate.utils import ( + is_ccl_available, + is_npu_available, + is_xpu_available +) +from transformers import ( + AutoConfig, + AutoModel, + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + BitsAndBytesConfig, + LogitsProcessor +) + +import modules.shared as shared +from modules import sampler_hijack +from modules.logging_colors import logger +from modules.text_generation import get_reply_from_output_ids +from modules.torch_utils import get_device + +transformers.logging.set_verbosity_error() +sampler_hijack.hijack_samplers() + +local_rank = None +if shared.args.deepspeed: + import deepspeed + from transformers.integrations.deepspeed import ( + HfDeepSpeedConfig, + is_deepspeed_zero3_enabled + ) + + from modules.deepspeed_parameters import generate_ds_config + + # Distributed setup + local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + if is_xpu_available() and is_ccl_available(): + torch.xpu.set_device(local_rank) + deepspeed.init_distributed(backend="ccl") + elif is_npu_available(): + torch.npu.set_device(local_rank) + deepspeed.init_distributed(dist_backend="hccl") + else: + torch.cuda.set_device(local_rank) + deepspeed.init_distributed() + ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) + dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration + + +class _StopEverythingStoppingCriteria(transformers.StoppingCriteria): + def __init__(self): + transformers.StoppingCriteria.__init__(self) + + def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool: + return shared.stop_everything + + +class Stream(transformers.StoppingCriteria): + def __init__(self, callback_func=None): + self.callback_func = callback_func + + def __call__(self, input_ids, scores) -> bool: + if self.callback_func is not None: + self.callback_func(input_ids[0]) + + return False + + +class LogitsBiasProcessor(LogitsProcessor): + def __init__(self, logit_bias={}): + self.logit_bias = logit_bias + if self.logit_bias: + self.keys = list([int(key) for key in self.logit_bias.keys()]) + values = [self.logit_bias[str(key)] for key in self.keys] + self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device) + + def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: + if self.logit_bias: + logits[0, self.keys] += self.values + + return logits + + def __repr__(self): + return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>" + + +class LogprobProcessor(LogitsProcessor): + def __init__(self, logprobs=None): + self.logprobs = logprobs + self.token_alternatives = {} + + def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: + if self.logprobs is not None: # 0-5 + log_e_probabilities = F.log_softmax(logits, dim=1) + top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1) + top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]] + top_probs = [float(x) for x in top_values[0]] + self.token_alternatives = dict(zip(top_tokens, top_probs)) + + return logits + + def __repr__(self): + return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>" + + +def load_tokenizer(model_name, tokenizer_dir=None): + if tokenizer_dir: + path_to_model = Path(tokenizer_dir) + else: + path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") + + tokenizer = None + if path_to_model.exists(): + if shared.args.no_use_fast: + logger.info('Loading the tokenizer with use_fast=False.') + + tokenizer = AutoTokenizer.from_pretrained( + path_to_model, + trust_remote_code=shared.args.trust_remote_code, + use_fast=not shared.args.no_use_fast + ) + + return tokenizer + + +def load_model_HF(model_name): + path_to_model = Path(f'{shared.args.model_dir}/{model_name}') + params = { + 'low_cpu_mem_usage': True, + 'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16, + } + + if shared.args.trust_remote_code: + params['trust_remote_code'] = True + + if shared.args.use_flash_attention_2: + params['use_flash_attention_2'] = True + + if shared.args.force_safetensors: + params['force_safetensors'] = True + + if shared.args.use_eager_attention: + params['attn_implementation'] = 'eager' + + config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) + + if 'chatglm' in model_name.lower(): + LoaderClass = AutoModel + else: + if config.to_dict().get('is_encoder_decoder', False): + LoaderClass = AutoModelForSeq2SeqLM + shared.is_seq2seq = True + else: + LoaderClass = AutoModelForCausalLM + + # Determine if we should use default loading + should_use_default_loading = not any([ + shared.args.cpu, + shared.args.load_in_8bit, + shared.args.load_in_4bit, + shared.args.disk, + shared.args.deepspeed, + shared.args.cpu_memory is not None, + shared.args.compress_pos_emb > 1, + shared.args.alpha_value > 1, + ]) + + # Load the model without any special settings + if should_use_default_loading: + params['device_map'] = 'auto' + + logger.info("TRANSFORMERS_PARAMS=") + pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) + print() + + model = LoaderClass.from_pretrained(path_to_model, **params) + if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit): + device = get_device() + if device: + model = model.to(device) + + # DeepSpeed ZeRO-3 + elif shared.args.deepspeed: + model = LoaderClass.from_pretrained( + path_to_model, + torch_dtype=params['torch_dtype'], + trust_remote_code=params.get('trust_remote_code') + ) + + model = deepspeed.initialize( + model=model, + config_params=ds_config, + model_parameters=None, + optimizer=None, + lr_scheduler=None + )[0] + + model.module.eval() # Inference + logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}') + + # Load with quantization and/or offloading + else: + if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())): + logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.') + shared.args.cpu = True + + if shared.args.cpu: + params['torch_dtype'] = torch.float32 + else: + params['device_map'] = 'auto' + if x := get_max_memory_dict(): + params['max_memory'] = x + + if shared.args.load_in_4bit: + # See https://github.com/huggingface/transformers/pull/23479/files + # and https://huggingface.co/blog/4bit-transformers-bitsandbytes + quantization_config_params = { + 'load_in_4bit': True, + 'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None, + 'bnb_4bit_quant_type': shared.args.quant_type, + 'bnb_4bit_use_double_quant': shared.args.use_double_quant, + 'llm_int8_enable_fp32_cpu_offload': True + } + params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params) + + elif shared.args.load_in_8bit: + if shared.args.gpu_split: + params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True) + else: + params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True) + + if params.get('max_memory') is not None: + with init_empty_weights(): + model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code')) + + model.tie_weights() + params['device_map'] = infer_auto_device_map( + model, + dtype=torch.int8, + max_memory=params.get('max_memory'), + no_split_module_classes=model._no_split_modules + ) + + if shared.args.disk: + params['offload_folder'] = shared.args.disk_cache_dir + + if shared.args.compress_pos_emb > 1: + params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} + elif shared.args.alpha_value > 1: + params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value} + + logger.info("TRANSFORMERS_PARAMS=") + pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) + print() + model = LoaderClass.from_pretrained(path_to_model, **params) + + if shared.args.torch_compile: + model = torch.compile(model) + + return model + + +def get_max_memory_dict(): + max_memory = {} + if shared.args.cpu_memory > 0: + max_memory['cpu'] = f'{shared.args.cpu_memory}GiB' + + if shared.args.gpu_split: + for i, memory in enumerate(shared.args.gpu_split.split(',')): + max_memory[i] = f'{memory}GiB' + + return max_memory if len(max_memory) > 0 else None diff --git a/modules/ui.py b/modules/ui.py index c36fe553..d5caaeaa 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -2,9 +2,7 @@ import copy from pathlib import Path import gradio as gr -import torch import yaml -from transformers import is_torch_xpu_available import extensions from modules import shared @@ -128,7 +126,6 @@ def list_model_elements(): 'torch_compile', 'flash_attn', 'use_flash_attention_2', - 'auto_devices', 'cpu', 'disk', 'row_split', @@ -150,13 +147,6 @@ def list_model_elements(): 'no_use_fast', ] - if is_torch_xpu_available(): - for i in range(torch.xpu.device_count()): - elements.append(f'gpu_memory_{i}') - else: - for i in range(torch.cuda.device_count()): - elements.append(f'gpu_memory_{i}') - return elements diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 8db48621..b4af771c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -1,14 +1,9 @@ import importlib -import math -import re import traceback from functools import partial from pathlib import Path import gradio as gr -import psutil -import torch -from transformers import is_torch_npu_available, is_torch_xpu_available from modules import loaders, shared, ui, utils from modules.logging_colors import logger @@ -27,35 +22,6 @@ from modules.utils import gradio def create_ui(): mu = shared.args.multi_user - # Finding the default values for the GPU and CPU memories - total_mem = [] - if is_torch_xpu_available(): - for i in range(torch.xpu.device_count()): - total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024))) - elif is_torch_npu_available(): - for i in range(torch.npu.device_count()): - total_mem.append(math.floor(torch.npu.get_device_properties(i).total_memory / (1024 * 1024))) - else: - for i in range(torch.cuda.device_count()): - total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024))) - - default_gpu_mem = [] - if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0: - for i in shared.args.gpu_memory: - if 'mib' in i.lower(): - default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i))) - else: - default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000) - - while len(default_gpu_mem) < len(total_mem): - default_gpu_mem.append(0) - - total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024)) - if shared.args.cpu_memory is not None: - default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory) - else: - default_cpu_mem = 0 - with gr.Tab("Model", elem_id="model-tab"): with gr.Row(): with gr.Column(): @@ -80,10 +46,6 @@ def create_ui(): with gr.Blocks(): with gr.Row(): with gr.Column(): - for i in range(len(total_mem)): - shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i]) - - shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem) shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) @@ -94,6 +56,7 @@ def create_ui(): shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') + shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.') @@ -107,7 +70,6 @@ def create_ui(): shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') - shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') diff --git a/server.py b/server.py index 1f227350..a4b6e3e3 100644 --- a/server.py +++ b/server.py @@ -1,11 +1,8 @@ import os import warnings -from modules import shared - -import accelerate # This early import makes Intel GPUs happy - import modules.one_click_installer_check +from modules import shared from modules.block_requests import OpenMonkeyPatch, RequestBlocker from modules.logging_colors import logger @@ -38,7 +35,6 @@ import yaml import modules.extensions as extensions_module from modules import ( - chat, training, ui, ui_chat, From 9c59acf8208cb0f70e739cbd05659fff37599369 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 10:01:31 -0700 Subject: [PATCH 02/21] Remove the numba requirement (it's no longer used) --- requirements.txt | 1 - requirements_amd.txt | 1 - requirements_amd_noavx2.txt | 1 - requirements_apple_intel.txt | 1 - requirements_apple_silicon.txt | 1 - requirements_cpu_only.txt | 1 - requirements_cpu_only_noavx2.txt | 1 - requirements_noavx2.txt | 1 - requirements_nowheels.txt | 1 - 9 files changed, 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 607efda0..c2f1a8bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* diff --git a/requirements_amd.txt b/requirements_amd.txt index b242d4ad..a1920f0d 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -6,7 +6,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index b6105209..92d4f152 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -6,7 +6,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index ce730f63..6b6707ab 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -6,7 +6,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index a7be282d..910fae3e 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -6,7 +6,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 2437c2ae..d34a4312 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -6,7 +6,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index cbaa8e96..cb3f3d6b 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -6,7 +6,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index cce27aa2..06352dc3 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -7,7 +7,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 3b61ca39..7b43e08d 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -6,7 +6,6 @@ fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown -numba==0.59.* numpy==1.26.* pandas peft==0.15.* From b3bf7a885d6c3d7216c1d996c0eb7581e1413d81 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 11:32:48 -0700 Subject: [PATCH 03/21] Fix ExLlamaV2_HF and ExLlamaV3_HF after ae02ffc605d90b19a507132c09055c7ff11df52b --- modules/exllamav2_hf.py | 15 ++++++++++----- modules/exllamav3_hf.py | 9 +++++++-- modules/models.py | 5 +++++ modules/transformers_loader.py | 2 -- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index 6486e438..b159d9ce 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -4,10 +4,6 @@ from pathlib import Path from typing import Any, Dict, Optional, Union import torch -from torch.nn import CrossEntropyLoss -from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel -from transformers.modeling_outputs import CausalLMOutputWithPast - from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -18,6 +14,15 @@ from exllamav2 import ( ExLlamaV2Cache_TP, ExLlamaV2Config ) +from torch.nn import CrossEntropyLoss +from transformers import ( + GenerationConfig, + GenerationMixin, + PretrainedConfig, + PreTrainedModel +) +from transformers.modeling_outputs import CausalLMOutputWithPast + from modules import shared from modules.logging_colors import logger @@ -28,7 +33,7 @@ except Exception: traceback.print_exc() -class Exllamav2HF(PreTrainedModel): +class Exllamav2HF(PreTrainedModel, GenerationMixin): def __init__(self, config: ExLlamaV2Config): super().__init__(PretrainedConfig()) self.ex_config = config diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 0f742fa2..2d9c493a 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -6,7 +6,12 @@ from typing import Any, Dict, Optional, Union import torch from exllamav3 import Cache, Config, Model from torch.nn import CrossEntropyLoss -from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel +from transformers import ( + GenerationConfig, + GenerationMixin, + PretrainedConfig, + PreTrainedModel +) from transformers.modeling_outputs import CausalLMOutputWithPast from modules import shared @@ -19,7 +24,7 @@ except Exception: traceback.print_exc() -class Exllamav3HF(PreTrainedModel): +class Exllamav3HF(PreTrainedModel, GenerationMixin): def __init__(self, model_dir): super().__init__(PretrainedConfig()) self.generation_config = GenerationConfig() diff --git a/modules/models.py b/modules/models.py index c4dfa149..2c23462a 100644 --- a/modules/models.py +++ b/modules/models.py @@ -1,3 +1,4 @@ +import sys import time from pathlib import Path @@ -34,6 +35,10 @@ def load_model(model_name, loader=None): logger.error('The path to the model does not exist. Exiting.') raise ValueError + if loader != 'llama.cpp' and 'sampler_hijack' not in sys.modules: + from modules import sampler_hijack + sampler_hijack.hijack_samplers() + shared.args.loader = loader output = load_func_map[loader](model_name) if type(output) is tuple: diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py index 5512f061..add3be66 100644 --- a/modules/transformers_loader.py +++ b/modules/transformers_loader.py @@ -22,13 +22,11 @@ from transformers import ( ) import modules.shared as shared -from modules import sampler_hijack from modules.logging_colors import logger from modules.text_generation import get_reply_from_output_ids from modules.torch_utils import get_device transformers.logging.set_verbosity_error() -sampler_hijack.hijack_samplers() local_rank = None if shared.args.deepspeed: From d5e1bccef91466918c43e291d8bd90ede5ae5d9f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 11:47:28 -0700 Subject: [PATCH 04/21] Remove the SpeechRecognition requirement --- extensions/openai/script.py | 3 ++- requirements.txt | 1 - requirements_amd.txt | 1 - requirements_amd_noavx2.txt | 1 - requirements_apple_intel.txt | 1 - requirements_apple_silicon.txt | 1 - requirements_cpu_only.txt | 1 - requirements_cpu_only_noavx2.txt | 1 - requirements_noavx2.txt | 1 - requirements_nowheels.txt | 1 - 10 files changed, 2 insertions(+), 10 deletions(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 2ce692c7..f907cdbb 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -6,7 +6,6 @@ import traceback from collections import deque from threading import Thread -import speech_recognition as sr import uvicorn from fastapi import Depends, FastAPI, Header, HTTPException from fastapi.middleware.cors import CORSMiddleware @@ -163,6 +162,8 @@ def handle_billing_usage(): @app.post('/v1/audio/transcriptions', dependencies=check_key) async def handle_audio_transcription(request: Request): + import speech_recognition as sr + r = sr.Recognizer() form = await request.form() diff --git a/requirements.txt b/requirements.txt index c2f1a8bd..9e55bb33 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,7 +25,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken diff --git a/requirements_amd.txt b/requirements_amd.txt index a1920f0d..ba8bc80c 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -24,7 +24,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 92d4f152..d921fbb0 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -24,7 +24,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 6b6707ab..0b92376a 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -24,7 +24,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 910fae3e..3e7fcd02 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -24,7 +24,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index d34a4312..43d54534 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -24,7 +24,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index cb3f3d6b..b4444443 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -24,7 +24,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 06352dc3..351ea83e 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -25,7 +25,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 7b43e08d..2e631bf0 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -24,7 +24,6 @@ tqdm wandb # API -SpeechRecognition==3.10.0 flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken From 8cfd7f976befb9c0780ae7db47a696a61bac089f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 13:35:42 -0700 Subject: [PATCH 05/21] Revert "Remove the old --model-menu flag" This reverts commit 109de34e3b3187eb3f463bf463086a48444013a0. --- modules/shared.py | 1 + server.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/modules/shared.py b/modules/shared.py index fd0f226e..08268ae0 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -79,6 +79,7 @@ group.add_argument('--model', type=str, help='Name of the model to load by defau group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.') group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.') group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.') +group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.') group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') diff --git a/server.py b/server.py index a4b6e3e3..1d261566 100644 --- a/server.py +++ b/server.py @@ -214,10 +214,28 @@ if __name__ == "__main__": if extension not in shared.args.extensions: shared.args.extensions.append(extension) + available_models = utils.get_available_models() + # Model defined through --model if shared.args.model is not None: shared.model_name = shared.args.model + # Select the model from a command-line menu + elif shared.args.model_menu: + if len(available_models) == 0: + logger.error('No models are available! Please download at least one.') + sys.exit(0) + else: + print('The following models are available:\n') + for i, model in enumerate(available_models): + print(f'{i+1}. {model}') + + print(f'\nWhich one do you want to load? 1-{len(available_models)}\n') + i = int(input()) - 1 + print() + + shared.model_name = available_models[i] + # If any model has been selected, load it if shared.model_name != 'None': p = Path(shared.model_name) From e243424ba12c1fe030dd5086f67571ad8a6970fa Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 17:51:28 -0700 Subject: [PATCH 06/21] Fix an import --- modules/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/evaluate.py b/modules/evaluate.py index bbf27da1..ba0de378 100644 --- a/modules/evaluate.py +++ b/modules/evaluate.py @@ -2,7 +2,6 @@ import datetime from pathlib import Path import pandas as pd -from datasets import load_dataset from tqdm import tqdm from modules import shared @@ -39,6 +38,7 @@ def calculate_perplexity(models, input_dataset, stride, _max_length): ''' import torch + from datasets import load_dataset from modules.torch_utils import clear_torch_cache From 99588be5760702d413435c9167935f29832a6d06 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 18:57:26 -0700 Subject: [PATCH 07/21] Organize one_click.py --- one_click.py | 183 +++++++++++++++++++++++++-------------------------- 1 file changed, 91 insertions(+), 92 deletions(-) diff --git a/one_click.py b/one_click.py index 99bdc41e..eff2ed9f 100644 --- a/one_click.py +++ b/one_click.py @@ -15,7 +15,6 @@ import sys # os.environ["HSA_OVERRIDE_GFX_VERSION"] = '10.3.0' # os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030' - # Define the required versions TORCH_VERSION = "2.6.0" TORCHVISION_VERSION = "0.21.0" @@ -62,6 +61,19 @@ def is_x86_64(): return platform.machine() == "x86_64" +def is_installed(): + site_packages_path = None + for sitedir in site.getsitepackages(): + if "site-packages" in sitedir and conda_env_path in sitedir: + site_packages_path = sitedir + break + + if site_packages_path: + return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py')) + else: + return os.path.isdir(conda_env_path) + + def cpu_has_avx2(): try: import cpuinfo @@ -104,44 +116,13 @@ def torch_version(): return torver -def update_pytorch_and_python(): - print_big_message("Checking for PyTorch updates.") - - # Update the Python version. Left here for future reference in case this becomes necessary. - # print_big_message("Checking for PyTorch and Python updates.") - # current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}" - # if current_python_version != PYTHON_VERSION: - # run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True) - - torver = torch_version() - base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}" - - if "+cu" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" - elif "+rocm" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1" - elif "+cpu" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" - elif "+cxx11" in torver: - intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10" - install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" - else: - install_cmd = base_cmd - - run_cmd(install_cmd, assert_success=True, environment=True) +def get_current_commit(): + result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True) + return result.stdout.decode('utf-8').strip() -def is_installed(): - site_packages_path = None - for sitedir in site.getsitepackages(): - if "site-packages" in sitedir and conda_env_path in sitedir: - site_packages_path = sitedir - break - - if site_packages_path: - return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py')) - else: - return os.path.isdir(conda_env_path) +def get_extensions_names(): + return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))] def check_env(): @@ -157,35 +138,11 @@ def check_env(): sys.exit(1) -def get_current_commit(): - result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True) - return result.stdout.decode('utf-8').strip() - - def clear_cache(): run_cmd("conda clean -a -y", environment=True) run_cmd("python -m pip cache purge", environment=True) -def print_big_message(message): - message = message.strip() - lines = message.split('\n') - print("\n\n*******************************************************************") - for line in lines: - print("*", line) - - print("*******************************************************************\n\n") - - -def calculate_file_hash(file_path): - p = os.path.join(script_dir, file_path) - if os.path.isfile(p): - with open(p, 'rb') as f: - return hashlib.sha256(f.read()).hexdigest() - else: - return '' - - def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None): # Use the conda environment if environment: @@ -210,6 +167,25 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, return result +def print_big_message(message): + message = message.strip() + lines = message.split('\n') + print("\n\n*******************************************************************") + for line in lines: + print("*", line) + + print("*******************************************************************\n\n") + + +def calculate_file_hash(file_path): + p = os.path.join(script_dir, file_path) + if os.path.isfile(p): + with open(p, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + else: + return '' + + def generate_alphabetic_sequence(index): result = '' while index >= 0: @@ -238,6 +214,51 @@ def get_user_choice(question, options_dict): return choice +def update_pytorch_and_python(): + print_big_message("Checking for PyTorch updates.") + + # Update the Python version. Left here for future reference in case this becomes necessary. + # print_big_message("Checking for PyTorch and Python updates.") + # current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + # if current_python_version != PYTHON_VERSION: + # run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True) + + torver = torch_version() + base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}" + + if "+cu" in torver: + install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" + elif "+rocm" in torver: + install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1" + elif "+cpu" in torver: + install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" + elif "+cxx11" in torver: + intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10" + install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" + else: + install_cmd = base_cmd + + run_cmd(install_cmd, assert_success=True, environment=True) + + +def clean_outdated_pytorch_cuda_dependencies(): + patterns = ["cu121", "cu122", "torch2.4"] + result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True) + matching_packages = [] + + for line in result.stdout.decode('utf-8').splitlines(): + if "==" in line: + pkg_name, version = line.split('==', 1) + if any(pattern in version for pattern in patterns): + matching_packages.append(pkg_name) + + if matching_packages: + print(f"\nUninstalling: {', '.join(matching_packages)}\n") + run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True) + + return matching_packages + + def install_webui(): if os.path.isfile(state_file): os.remove(state_file) @@ -323,37 +344,6 @@ def install_webui(): update_requirements(initial_installation=True, pull=False) -def get_extensions_names(): - return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))] - - -def install_extensions_requirements(): - print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.") - extensions = get_extensions_names() - for i, extension in enumerate(extensions): - print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n") - extension_req_path = os.path.join("extensions", extension, "requirements.txt") - run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True) - - -def clean_outdated_pytorch_cuda_dependencies(): - patterns = ["cu121", "cu122", "torch2.4"] - result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True) - matching_packages = [] - - for line in result.stdout.decode('utf-8').splitlines(): - if "==" in line: - pkg_name, version = line.split('==', 1) - if any(pattern in version for pattern in patterns): - matching_packages.append(pkg_name) - - if matching_packages: - print(f"\nUninstalling: {', '.join(matching_packages)}\n") - run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True) - - return matching_packages - - def update_requirements(initial_installation=False, pull=True): # Create .git directory if missing if not os.path.exists(os.path.join(script_dir, ".git")): @@ -475,6 +465,15 @@ def update_requirements(initial_installation=False, pull=True): clear_cache() +def install_extensions_requirements(): + print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.") + extensions = get_extensions_names() + for i, extension in enumerate(extensions): + print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n") + extension_req_path = os.path.join("extensions", extension, "requirements.txt") + run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True) + + def launch_webui(): run_cmd(f"python server.py {flags}", environment=True) From d3e7c655e5b4ce3fb15ecd7e27064f8df727ce14 Mon Sep 17 00:00:00 2001 From: Matthew Jenkins <40323108+Matthew-Jenkins@users.noreply.github.com> Date: Sun, 20 Apr 2025 22:06:24 -0400 Subject: [PATCH 08/21] Add support for llama-cpp builds from https://github.com/ggml-org/llama.cpp (#6862) --- modules/llama_cpp_server.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 02a56b3c..c88f945d 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -1,4 +1,5 @@ import json +import os import pprint import socket import subprocess @@ -281,12 +282,21 @@ class LlamaServer: if shared.args.rope_freq_base > 0: cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)] + env = os.environ.copy() + if os.name == 'posix': + current_path = env.get('LD_LIBRARY_PATH', '') + if current_path: + env['LD_LIBRARY_PATH'] = f"{current_path}:{os.path.dirname(self.server_path)}" + else: + env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path) + # Start the server with pipes for output self.process = subprocess.Popen( cmd, stderr=subprocess.PIPE, text=True, - bufsize=1 + bufsize=1, + env=env ) def filter_stderr(process_stderr): From ff1c00bdd993f1610add8b0d2eff208809506444 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 19:08:44 -0700 Subject: [PATCH 09/21] llama.cpp: set the random seed manually --- modules/text_generation.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 585e4d9d..40046eb2 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -215,20 +215,21 @@ def formatted_outputs(reply, model_name): def set_manual_seed(seed): - import torch - from transformers import is_torch_npu_available, is_torch_xpu_available - seed = int(seed) if seed == -1: seed = random.randint(1, 2**31) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - elif is_torch_xpu_available(): - torch.xpu.manual_seed_all(seed) - elif is_torch_npu_available(): - torch.npu.manual_seed_all(seed) + if shared.args.loader != 'llama.cpp': + import torch + from transformers import is_torch_npu_available, is_torch_xpu_available + + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + elif is_torch_xpu_available(): + torch.xpu.manual_seed_all(seed) + elif is_torch_npu_available(): + torch.npu.manual_seed_all(seed) return seed @@ -476,11 +477,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N For models that do not use the transformers library for sampling """ - seed = state['seed'] - if shared.args.loader != 'llama.cpp': - print(shared.args.loader) - seed = set_manual_seed(seed) - + seed = set_manual_seed(state['seed']) t0 = time.time() reply = '' try: From 6117ef7d64543f337abe6f0b33dca6b8e9294f9e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 19:12:04 -0700 Subject: [PATCH 10/21] Move the requirements*.txt to a requirements folder --- requirements.txt => requirements/requirements.txt | 0 requirements_amd.txt => requirements/requirements_amd.txt | 0 .../requirements_amd_noavx2.txt | 0 .../requirements_apple_intel.txt | 0 .../requirements_apple_silicon.txt | 0 .../requirements_cpu_only.txt | 0 .../requirements_cpu_only_noavx2.txt | 0 requirements_noavx2.txt => requirements/requirements_noavx2.txt | 0 .../requirements_nowheels.txt | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename requirements.txt => requirements/requirements.txt (100%) rename requirements_amd.txt => requirements/requirements_amd.txt (100%) rename requirements_amd_noavx2.txt => requirements/requirements_amd_noavx2.txt (100%) rename requirements_apple_intel.txt => requirements/requirements_apple_intel.txt (100%) rename requirements_apple_silicon.txt => requirements/requirements_apple_silicon.txt (100%) rename requirements_cpu_only.txt => requirements/requirements_cpu_only.txt (100%) rename requirements_cpu_only_noavx2.txt => requirements/requirements_cpu_only_noavx2.txt (100%) rename requirements_noavx2.txt => requirements/requirements_noavx2.txt (100%) rename requirements_nowheels.txt => requirements/requirements_nowheels.txt (100%) diff --git a/requirements.txt b/requirements/requirements.txt similarity index 100% rename from requirements.txt rename to requirements/requirements.txt diff --git a/requirements_amd.txt b/requirements/requirements_amd.txt similarity index 100% rename from requirements_amd.txt rename to requirements/requirements_amd.txt diff --git a/requirements_amd_noavx2.txt b/requirements/requirements_amd_noavx2.txt similarity index 100% rename from requirements_amd_noavx2.txt rename to requirements/requirements_amd_noavx2.txt diff --git a/requirements_apple_intel.txt b/requirements/requirements_apple_intel.txt similarity index 100% rename from requirements_apple_intel.txt rename to requirements/requirements_apple_intel.txt diff --git a/requirements_apple_silicon.txt b/requirements/requirements_apple_silicon.txt similarity index 100% rename from requirements_apple_silicon.txt rename to requirements/requirements_apple_silicon.txt diff --git a/requirements_cpu_only.txt b/requirements/requirements_cpu_only.txt similarity index 100% rename from requirements_cpu_only.txt rename to requirements/requirements_cpu_only.txt diff --git a/requirements_cpu_only_noavx2.txt b/requirements/requirements_cpu_only_noavx2.txt similarity index 100% rename from requirements_cpu_only_noavx2.txt rename to requirements/requirements_cpu_only_noavx2.txt diff --git a/requirements_noavx2.txt b/requirements/requirements_noavx2.txt similarity index 100% rename from requirements_noavx2.txt rename to requirements/requirements_noavx2.txt diff --git a/requirements_nowheels.txt b/requirements/requirements_nowheels.txt similarity index 100% rename from requirements_nowheels.txt rename to requirements/requirements_nowheels.txt From c178ea02fe962e9094cec077b5432dd3a51c777a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 19:27:38 -0700 Subject: [PATCH 11/21] Revert "Move the requirements*.txt to a requirements folder" This reverts commit 6117ef7d64543f337abe6f0b33dca6b8e9294f9e. --- requirements/requirements.txt => requirements.txt | 0 requirements/requirements_amd.txt => requirements_amd.txt | 0 .../requirements_amd_noavx2.txt => requirements_amd_noavx2.txt | 0 .../requirements_apple_intel.txt => requirements_apple_intel.txt | 0 ...quirements_apple_silicon.txt => requirements_apple_silicon.txt | 0 .../requirements_cpu_only.txt => requirements_cpu_only.txt | 0 ...ements_cpu_only_noavx2.txt => requirements_cpu_only_noavx2.txt | 0 requirements/requirements_noavx2.txt => requirements_noavx2.txt | 0 .../requirements_nowheels.txt => requirements_nowheels.txt | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename requirements/requirements.txt => requirements.txt (100%) rename requirements/requirements_amd.txt => requirements_amd.txt (100%) rename requirements/requirements_amd_noavx2.txt => requirements_amd_noavx2.txt (100%) rename requirements/requirements_apple_intel.txt => requirements_apple_intel.txt (100%) rename requirements/requirements_apple_silicon.txt => requirements_apple_silicon.txt (100%) rename requirements/requirements_cpu_only.txt => requirements_cpu_only.txt (100%) rename requirements/requirements_cpu_only_noavx2.txt => requirements_cpu_only_noavx2.txt (100%) rename requirements/requirements_noavx2.txt => requirements_noavx2.txt (100%) rename requirements/requirements_nowheels.txt => requirements_nowheels.txt (100%) diff --git a/requirements/requirements.txt b/requirements.txt similarity index 100% rename from requirements/requirements.txt rename to requirements.txt diff --git a/requirements/requirements_amd.txt b/requirements_amd.txt similarity index 100% rename from requirements/requirements_amd.txt rename to requirements_amd.txt diff --git a/requirements/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt similarity index 100% rename from requirements/requirements_amd_noavx2.txt rename to requirements_amd_noavx2.txt diff --git a/requirements/requirements_apple_intel.txt b/requirements_apple_intel.txt similarity index 100% rename from requirements/requirements_apple_intel.txt rename to requirements_apple_intel.txt diff --git a/requirements/requirements_apple_silicon.txt b/requirements_apple_silicon.txt similarity index 100% rename from requirements/requirements_apple_silicon.txt rename to requirements_apple_silicon.txt diff --git a/requirements/requirements_cpu_only.txt b/requirements_cpu_only.txt similarity index 100% rename from requirements/requirements_cpu_only.txt rename to requirements_cpu_only.txt diff --git a/requirements/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt similarity index 100% rename from requirements/requirements_cpu_only_noavx2.txt rename to requirements_cpu_only_noavx2.txt diff --git a/requirements/requirements_noavx2.txt b/requirements_noavx2.txt similarity index 100% rename from requirements/requirements_noavx2.txt rename to requirements_noavx2.txt diff --git a/requirements/requirements_nowheels.txt b/requirements_nowheels.txt similarity index 100% rename from requirements/requirements_nowheels.txt rename to requirements_nowheels.txt From 86c3ed3218e0d2033855816bc3bb1054b2071bbc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 20 Apr 2025 20:00:56 -0700 Subject: [PATCH 12/21] Small change to the unload_model() function --- modules/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/models.py b/modules/models.py index 2c23462a..99b068aa 100644 --- a/modules/models.py +++ b/modules/models.py @@ -131,6 +131,9 @@ def TensorRT_LLM_loader(model_name): def unload_model(keep_model_name=False): + if shared.model is None: + return + is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') shared.model = shared.tokenizer = None From 15989c2ed8fb53a0aafcbd294155ee3c61d15ed1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 21 Apr 2025 16:36:35 -0700 Subject: [PATCH 13/21] Make llama.cpp the default loader --- modules/loaders.py | 34 +++++++++++++++++----------------- server.py | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index f69f1720..7d6afe80 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -4,6 +4,23 @@ from collections import OrderedDict import gradio as gr loaders_and_params = OrderedDict({ + 'llama.cpp': [ + 'n_gpu_layers', + 'threads', + 'threads_batch', + 'batch_size', + 'n_ctx', + 'cache_type', + 'tensor_split', + 'rope_freq_base', + 'compress_pos_emb', + 'flash_attn', + 'row_split', + 'no_kv_offload', + 'no_mmap', + 'mlock', + 'numa', + ], 'Transformers': [ 'gpu_split', 'cpu_memory', @@ -23,23 +40,6 @@ loaders_and_params = OrderedDict({ 'trust_remote_code', 'no_use_fast', ], - 'llama.cpp': [ - 'n_gpu_layers', - 'threads', - 'threads_batch', - 'batch_size', - 'n_ctx', - 'cache_type', - 'tensor_split', - 'rope_freq_base', - 'compress_pos_emb', - 'flash_attn', - 'row_split', - 'no_kv_offload', - 'no_mmap', - 'mlock', - 'numa', - ], 'ExLlamav3_HF': [ 'max_seq_len', 'gpu_split', diff --git a/server.py b/server.py index 1d261566..41a5660d 100644 --- a/server.py +++ b/server.py @@ -85,7 +85,7 @@ def create_interface(): # Force some events to be triggered on page load shared.persistent_interface_state.update({ - 'loader': shared.args.loader or 'Transformers', + 'loader': shared.args.loader or 'llama.cpp', 'mode': shared.settings['mode'] if shared.settings['mode'] == 'instruct' else gr.update(), 'character_menu': shared.args.character or shared.settings['character'], 'instruction_template_str': shared.settings['instruction_template_str'], From 8320190184c285f6a909c11ee040962a498e6e4f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 21 Apr 2025 18:32:23 -0700 Subject: [PATCH 14/21] Fix the exllamav2_HF and exllamav3_HF loaders --- modules/sampler_hijack.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py index ee871a6e..dfdb6914 100644 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -15,6 +15,9 @@ from modules import shared from modules.logging_colors import logger from modules.torch_utils import get_device +original_init = transformers.GenerationConfig.__init__ +original_get_logits_processor = transformers.GenerationMixin._get_logits_processor + global_scores = None @@ -484,7 +487,7 @@ def get_logits_processor_patch(self, **kwargs): generation_config.temperature = float(generation_config.temperature) # Must be float # Get the original warpers - warpers = self._get_logits_processor_old(**kwargs) + warpers = original_get_logits_processor(self, **kwargs) for i in range(len(warpers) - 1, -1, -1): # Replace temperature with our modified class. @@ -674,7 +677,7 @@ def get_logits_processor_patch(self, **kwargs): def generation_config_init_patch(self, **kwargs): - self.__init___old(**kwargs) + original_init(self, **kwargs) self.min_p = kwargs.pop("min_p", 0.0) self.dynamic_temperature = kwargs.pop("dynamic_temperature", False) self.dynatemp_low = kwargs.pop("dynatemp_low", 1) @@ -702,8 +705,5 @@ def generation_config_init_patch(self, **kwargs): def hijack_samplers(): - transformers.GenerationMixin._get_logits_processor_old = transformers.GenerationMixin._get_logits_processor transformers.GenerationMixin._get_logits_processor = get_logits_processor_patch - - transformers.GenerationConfig.__init___old = transformers.GenerationConfig.__init__ transformers.GenerationConfig.__init__ = generation_config_init_patch From 78aeabca89275939f66c58dff7530966d9886a18 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 21 Apr 2025 18:33:14 -0700 Subject: [PATCH 15/21] Fix the transformers loader --- modules/models_settings.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index f5f3be7a..ee2ed71b 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -197,13 +197,9 @@ def update_model_parameters(state, initial=False): if initial and element in shared.provided_arguments: continue - if element in ['cpu_memory'] and value == 0: + if element == 'cpu_memory' and value == 0: value = vars(shared.args_defaults)[element] - # Making some simple conversions - if element == 'cpu_memory' and value is not None: - value = f"{value}MiB" - setattr(shared.args, element, value) From ee09e44c85543025def97a990888f4086b80b190 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Tue, 22 Apr 2025 09:25:57 -0300 Subject: [PATCH 16/21] Portable version (#6868) --- .github/dependabot.yml | 10 +- .github/workflows/build-everything-tgw.yml | 49 +++++ .../workflows/build-portable-release-cuda.yml | 182 +++++++++++++++++ .github/workflows/build-portable-release.yml | 192 ++++++++++++++++++ one_click.py | 12 +- .../full/requirements.txt | 4 +- .../full/requirements_amd.txt | 2 +- .../full/requirements_amd_noavx2.txt | 2 +- .../full/requirements_apple_intel.txt | 5 +- .../full/requirements_apple_silicon.txt | 5 +- .../full/requirements_cpu_only.txt | 4 +- .../full/requirements_cpu_only_noavx2.txt | 4 +- .../full/requirements_noavx2.txt | 4 +- .../full/requirements_nowheels.txt | 0 requirements/portable/requirements.txt | 19 ++ requirements/portable/requirements_amd.txt | 18 ++ .../portable/requirements_amd_noavx2.txt | 18 ++ .../portable/requirements_apple_intel.txt | 19 ++ .../portable/requirements_apple_silicon.txt | 20 ++ .../portable/requirements_cpu_only.txt | 19 ++ .../portable/requirements_cpu_only_noavx2.txt | 19 ++ requirements/portable/requirements_noavx2.txt | 19 ++ .../portable/requirements_nowheels.txt | 15 ++ start_linux.sh | 6 + start_macos.sh | 6 + start_windows.bat | 6 + 26 files changed, 638 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/build-everything-tgw.yml create mode 100644 .github/workflows/build-portable-release-cuda.yml create mode 100644 .github/workflows/build-portable-release.yml rename requirements.txt => requirements/full/requirements.txt (86%) rename requirements_amd.txt => requirements/full/requirements_amd.txt (82%) rename requirements_amd_noavx2.txt => requirements/full/requirements_amd_noavx2.txt (82%) rename requirements_apple_silicon.txt => requirements/full/requirements_apple_intel.txt (51%) rename requirements_apple_intel.txt => requirements/full/requirements_apple_silicon.txt (70%) rename requirements_cpu_only.txt => requirements/full/requirements_cpu_only.txt (63%) rename requirements_cpu_only_noavx2.txt => requirements/full/requirements_cpu_only_noavx2.txt (63%) rename requirements_noavx2.txt => requirements/full/requirements_noavx2.txt (86%) rename requirements_nowheels.txt => requirements/full/requirements_nowheels.txt (100%) create mode 100644 requirements/portable/requirements.txt create mode 100644 requirements/portable/requirements_amd.txt create mode 100644 requirements/portable/requirements_amd_noavx2.txt create mode 100644 requirements/portable/requirements_apple_intel.txt create mode 100644 requirements/portable/requirements_apple_silicon.txt create mode 100644 requirements/portable/requirements_cpu_only.txt create mode 100644 requirements/portable/requirements_cpu_only_noavx2.txt create mode 100644 requirements/portable/requirements_noavx2.txt create mode 100644 requirements/portable/requirements_nowheels.txt diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 93aaf445..69918a57 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,8 +5,14 @@ version: 2 updates: - - package-ecosystem: "pip" # See documentation for possible values - directory: "/" # Location of package manifests + - package-ecosystem: "pip" + directory: "/requirements/full/" + target-branch: "dev" + schedule: + interval: "weekly" + + - package-ecosystem: "pip" + directory: "/requirements/portable/" target-branch: "dev" schedule: interval: "weekly" diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml new file mode 100644 index 00000000..91ca5f6b --- /dev/null +++ b/.github/workflows/build-everything-tgw.yml @@ -0,0 +1,49 @@ +name: Build Everything TGW + +on: + workflow_dispatch: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + +permissions: + contents: write + +jobs: + build_release_cuda_windows: + name: CUDA Windows + uses: ./.github/workflows/build-portable-release-cuda.yml + with: + version: ${{ inputs.version }} + config: 'os:windows-2019' + + build_release_cuda_linux: + name: CUDA Linux + uses: ./.github/workflows/build-portable-release-cuda.yml + with: + version: ${{ inputs.version }} + config: 'os:ubuntu-22.04' + + build_release_cpu_windows: + name: CPU Windows + uses: ./.github/workflows/build-portable-release.yml + with: + version: ${{ inputs.version }} + config: 'os:windows-2019' + + build_release_cpu_linux: + name: CPU Linux + uses: ./.github/workflows/build-portable-release.yml + with: + version: ${{ inputs.version }} + config: 'os:ubuntu-22.04' + + build_release_macos: + name: macOS + uses: ./.github/workflows/build-portable-release.yml + with: + version: ${{ inputs.version }} + config: 'os:macos-13,macos-14' diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml new file mode 100644 index 00000000..b3cc587a --- /dev/null +++ b/.github/workflows/build-portable-release-cuda.yml @@ -0,0 +1,182 @@ +name: Build CUDA + +on: + workflow_dispatch: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + workflow_call: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + env: + CONFIGIN: ${{ inputs.config }} + EXCLUDEIN: ${{ inputs.exclude }} + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('ubuntu-22.04', 'windows-2019') + 'pyver' = @("3.11") + 'avx' = @("AVX2") + 'cuda' = @("11.7", "12.4") + } + + if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} + + if ($env:EXCLUDEIN -ne 'None') { + $exclusions = @() + $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData + $matrix['exclude'] = $exclusions + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }} CUDA ${{ matrix.cuda }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + defaults: + run: + shell: pwsh + env: + AVXVER: ${{ matrix.avx }} + PCKGVER: ${{ inputs.version }} + + steps: + - uses: actions/checkout@v4 + with: + repository: 'oobabooga/text-generation-webui' + ref: ${{ inputs.version }} + submodules: 'recursive' + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.pyver }} + + - name: Build Package + shell: bash + run: | + rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker + + # Define common variables + CUDA_VERSION="${{ matrix.cuda }}" + AVX_SUPPORT="${{ matrix.avx }}" + VERSION="${{ inputs.version }}" + + # 1. Set platform-specific variables + if [[ "$RUNNER_OS" == "Windows" ]]; then + PLATFORM="windows" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz" + PIP_PATH="portable_env/python.exe -m pip" + PACKAGES_PATH="portable_env/Lib/site-packages" + ZIP_CMD="powershell -Command \"Compress-Archive -Path text-generation-webui -DestinationPath" + rm start_linux.sh start_macos.sh + else + PLATFORM="linux" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz" + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.11/site-packages" + ZIP_CMD="zip -r" + rm start_macos.sh start_windows.bat + fi + + # 2. Download and extract Python + cd .. + echo "Downloading Python for $PLATFORM..." + curl -L -o python-build.tar.gz "$PYTHON_URL" + tar -xzf python-build.tar.gz + mv python text-generation-webui/portable_env + + # 3. Prepare requirements file based on AVX and CUDA + if [[ "$AVX_SUPPORT" == "AVX2" ]]; then + BASE_REQ_FILE="requirements/portable/requirements.txt" + else + BASE_REQ_FILE="requirements/portable/requirements_noavx2.txt" + fi + + # Create CUDA-specific requirements file if needed + cd text-generation-webui + if [[ "$CUDA_VERSION" == "11.7" ]]; then + echo "Creating CUDA 11.7 specific requirements file" + sed 's/cu124/cu117/g' "$BASE_REQ_FILE" > requirements_cuda_temp.txt + REQ_FILE="requirements_cuda_temp.txt" + else + REQ_FILE="$BASE_REQ_FILE" + fi + + # 4. Install packages + echo "Installing Python packages from $REQ_FILE..." + $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE" + + # 5. Clean up + if [[ "$CUDA_VERSION" == "11.7" ]]; then + rm requirements_cuda_temp.txt + fi + + # 6. Create ZIP file + cd .. + ZIP_NAME="textgen-portable-${VERSION}-${PLATFORM}-cuda${CUDA_VERSION}.zip" + echo "Creating archive: $ZIP_NAME" + + if [[ "$RUNNER_OS" == "Windows" ]]; then + powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME" + else + zip -r "$ZIP_NAME" text-generation-webui + fi + + - name: Upload files to a GitHub release + id: upload-release + uses: svenstaro/upload-release-action@2.7.0 + continue-on-error: true + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: ../textgen-portable-${{ inputs.version }}*.zip + tag: ${{ inputs.version }} + file_glob: true + make_latest: false + overwrite: true diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml new file mode 100644 index 00000000..bb61eb8f --- /dev/null +++ b/.github/workflows/build-portable-release.yml @@ -0,0 +1,192 @@ +name: Build CPU and macOS + +on: + workflow_dispatch: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + workflow_call: + inputs: + version: + description: 'Version tag of text-generation-webui to build: v3.0' + default: 'v3.0' + required: true + type: string + config: + description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2' + default: 'Default' + required: false + type: string + exclude: + description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2' + default: 'None' + required: false + type: string + +permissions: + contents: write + +jobs: + define_matrix: + name: Define Build Matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + defaults: + run: + shell: pwsh + env: + CONFIGIN: ${{ inputs.config }} + EXCLUDEIN: ${{ inputs.exclude }} + + steps: + - name: Define Job Output + id: set-matrix + run: | + $matrix = @{ + 'os' = @('ubuntu-22.04', 'windows-2019', 'macos-13', 'macos-14') + 'pyver' = @("3.11") + 'avx' = @("AVX2") + } + + if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})} + + if ($env:EXCLUDEIN -ne 'None') { + $exclusions = @() + $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData + $matrix['exclude'] = $exclusions + } + + $matrixOut = ConvertTo-Json $matrix -Compress + Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT + + build_wheels: + name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }} + needs: define_matrix + runs-on: ${{ matrix.os }} + strategy: + matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} + defaults: + run: + shell: pwsh + env: + AVXVER: ${{ matrix.avx }} + PCKGVER: ${{ inputs.version }} + + steps: + - uses: actions/checkout@v4 + with: + repository: 'oobabooga/text-generation-webui' + ref: ${{ inputs.version }} + submodules: 'recursive' + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.pyver }} + + - name: Build Package + shell: bash + run: | + rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker + + # Define common variables + AVX_SUPPORT="${{ matrix.avx }}" + VERSION="${{ inputs.version }}" + OS_TYPE="${{ matrix.os }}" + + # 1. Set platform-specific variables + if [[ "$RUNNER_OS" == "Windows" ]]; then + PLATFORM="windows-cpu" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz" + PIP_PATH="portable_env/python.exe -m pip" + PACKAGES_PATH="portable_env/Lib/site-packages" + rm start_linux.sh start_macos.sh + elif [[ "$RUNNER_OS" == "macOS" ]]; then + if [[ "$OS_TYPE" == "macos-13" ]]; then + PLATFORM="macos-x86_64" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-apple-darwin-install_only.tar.gz" + REQ_TYPE="apple_intel" + else + PLATFORM="macos-arm64" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-aarch64-apple-darwin-install_only.tar.gz" + REQ_TYPE="apple_silicon" + fi + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.11/site-packages" + rm start_linux.sh start_windows.bat + else + # Linux case + PLATFORM="linux-cpu" + PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz" + PIP_PATH="portable_env/bin/python -m pip" + PACKAGES_PATH="portable_env/lib/python3.11/site-packages" + rm start_macos.sh start_windows.bat + fi + + # 2. Download and extract Python + echo "Downloading Python for $PLATFORM..." + cd .. + curl -L -o python-build.tar.gz "$PYTHON_URL" + tar -xzf python-build.tar.gz + mv python text-generation-webui/portable_env + + # 3. Prepare requirements file based on platform and AVX + cd text-generation-webui + + # Select requirements file based on platform + if [[ "$RUNNER_OS" == "macOS" ]]; then + if [[ "$OS_TYPE" == "macos-13" ]]; then + REQ_FILE="requirements/portable/requirements_apple_intel.txt" + else + REQ_FILE="requirements/portable/requirements_apple_silicon.txt" + fi + else + # For Windows and Linux, check AVX support + if [[ "$AVX_SUPPORT" == "AVX2" ]]; then + REQ_FILE="requirements/portable/requirements_cpu_only.txt" + else + REQ_FILE="requirements/portable/requirements_cpu_only_noavx2.txt" + fi + fi + + echo "Using requirements file: $REQ_FILE" + + # 4. Install packages + echo "Installing Python packages from $REQ_FILE..." + $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE" + + # 5. Create ZIP file + cd .. + ZIP_NAME="textgen-portable-${VERSION}-${PLATFORM}.zip" + echo "Creating archive: $ZIP_NAME" + + if [[ "$RUNNER_OS" == "Windows" ]]; then + powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME" + else + zip -r "$ZIP_NAME" text-generation-webui + fi + + - name: Upload files to a GitHub release + id: upload-release + uses: svenstaro/upload-release-action@2.7.0 + continue-on-error: true + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: ../textgen-portable-${{ inputs.version }}*.zip + tag: ${{ inputs.version }} + file_glob: true + make_latest: false + overwrite: true diff --git a/one_click.py b/one_click.py index eff2ed9f..04b729eb 100644 --- a/one_click.py +++ b/one_click.py @@ -356,14 +356,18 @@ def update_requirements(initial_installation=False, pull=True): ) torver = torch_version() + requirements_base = os.path.join("requirements", "full") + if "+rocm" in torver: - requirements_file = "requirements_amd" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" + file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt" elif "+cpu" in torver or "+cxx11" in torver: - requirements_file = "requirements_cpu_only" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" + file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt" elif is_macos(): - requirements_file = "requirements_apple_" + ("intel" if is_x86_64() else "silicon") + ".txt" + file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt" else: - requirements_file = "requirements" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" + file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt" + + requirements_file = os.path.join(requirements_base, file_name) # Load state from JSON file current_commit = get_current_commit() diff --git a/requirements.txt b/requirements/full/requirements.txt similarity index 86% rename from requirements.txt rename to requirements/full/requirements.txt index 9e55bb33..b9afaa07 100644 --- a/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements/full/requirements_amd.txt similarity index 82% rename from requirements_amd.txt rename to requirements/full/requirements_amd.txt index ba8bc80c..96cb299d 100644 --- a/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt similarity index 82% rename from requirements_amd_noavx2.txt rename to requirements/full/requirements_amd_noavx2.txt index d921fbb0..0f1a4fc2 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_apple_silicon.txt b/requirements/full/requirements_apple_intel.txt similarity index 51% rename from requirements_apple_silicon.txt rename to requirements/full/requirements_apple_intel.txt index 3e7fcd02..8d1e5294 100644 --- a/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,8 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl diff --git a/requirements_apple_intel.txt b/requirements/full/requirements_apple_silicon.txt similarity index 70% rename from requirements_apple_intel.txt rename to requirements/full/requirements_apple_silicon.txt index 0b92376a..a44ff3cb 100644 --- a/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,7 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl diff --git a/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt similarity index 63% rename from requirements_cpu_only.txt rename to requirements/full/requirements_cpu_only.txt index 43d54534..35855162 100644 --- a/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt similarity index 63% rename from requirements_cpu_only_noavx2.txt rename to requirements/full/requirements_cpu_only_noavx2.txt index b4444443..0716455e 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt similarity index 86% rename from requirements_noavx2.txt rename to requirements/full/requirements_noavx2.txt index 351ea83e..98c43b88 100644 --- a/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt similarity index 100% rename from requirements_nowheels.txt rename to requirements/full/requirements_nowheels.txt diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt new file mode 100644 index 00000000..c3336fc7 --- /dev/null +++ b/requirements/portable/requirements.txt @@ -0,0 +1,19 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken + +# CUDA wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt new file mode 100644 index 00000000..4855225f --- /dev/null +++ b/requirements/portable/requirements_amd.txt @@ -0,0 +1,18 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken + +# AMD wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt new file mode 100644 index 00000000..f40daa8a --- /dev/null +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -0,0 +1,18 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken + +# AMD wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt new file mode 100644 index 00000000..1ede251e --- /dev/null +++ b/requirements/portable/requirements_apple_intel.txt @@ -0,0 +1,19 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken + +# Mac wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt new file mode 100644 index 00000000..26b68bff --- /dev/null +++ b/requirements/portable/requirements_apple_silicon.txt @@ -0,0 +1,20 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken + +# Mac wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt new file mode 100644 index 00000000..456a0499 --- /dev/null +++ b/requirements/portable/requirements_cpu_only.txt @@ -0,0 +1,19 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken + +# llama.cpp (CPU only, AVX2) +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt new file mode 100644 index 00000000..7cd2dd34 --- /dev/null +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -0,0 +1,19 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken + +# llama.cpp (CPU only, no AVX2) +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt new file mode 100644 index 00000000..b47b8bbc --- /dev/null +++ b/requirements/portable/requirements_noavx2.txt @@ -0,0 +1,19 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken + +# CUDA wheels +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt new file mode 100644 index 00000000..6f9566ba --- /dev/null +++ b/requirements/portable/requirements_nowheels.txt @@ -0,0 +1,15 @@ +fastapi==0.112.4 +gradio==4.37.* +jinja2==3.1.6 +markdown +numpy==1.26.* +pydantic==2.8.2 +pyyaml +requests +rich +tqdm + +# API +flask_cloudflared==0.0.14 +sse-starlette==1.6.5 +tiktoken diff --git a/start_linux.sh b/start_linux.sh index 256604cb..00082f07 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -2,6 +2,12 @@ cd "$(dirname "${BASH_SOURCE[0]}")" +# Portable install case +if [ -d "portable_env" ]; then + ./portable_env/bin/python3 server.py --api --auto-launch "$@" + exit $? +fi + if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi # deactivate existing conda envs as needed to avoid conflicts diff --git a/start_macos.sh b/start_macos.sh index 02f1011a..628f59cc 100755 --- a/start_macos.sh +++ b/start_macos.sh @@ -2,6 +2,12 @@ cd "$(dirname "${BASH_SOURCE[0]}")" +# Portable install case +if [ -d "portable_env" ]; then + ./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@" + exit $? +fi + if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi # deactivate existing conda envs as needed to avoid conflicts diff --git a/start_windows.bat b/start_windows.bat index 2e42d6fa..451b85e0 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -3,6 +3,12 @@ setlocal enabledelayedexpansion cd /D "%~dp0" +@rem Portable install case +if exist "portable_env" ( + .\portable_env\python.exe server.py --api --auto-launch %* + exit /b %errorlevel% +) + set PATH=%PATH%;%SystemRoot%\system32 echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end From 008c6dd6820556b62e033a17119ec34d62e87c7a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 22 Apr 2025 05:32:41 -0700 Subject: [PATCH 17/21] Lint --- modules/exllamav2.py | 2 +- modules/exllamav2_hf.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 0289bb21..92e95ac6 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -3,6 +3,7 @@ import traceback from pathlib import Path import torch + from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -15,7 +16,6 @@ from exllamav2 import ( ExLlamaV2Tokenizer ) from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator - from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index b159d9ce..5ee0bf60 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -4,6 +4,15 @@ from pathlib import Path from typing import Any, Dict, Optional, Union import torch +from torch.nn import CrossEntropyLoss +from transformers import ( + GenerationConfig, + GenerationMixin, + PretrainedConfig, + PreTrainedModel +) +from transformers.modeling_outputs import CausalLMOutputWithPast + from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -14,15 +23,6 @@ from exllamav2 import ( ExLlamaV2Cache_TP, ExLlamaV2Config ) -from torch.nn import CrossEntropyLoss -from transformers import ( - GenerationConfig, - GenerationMixin, - PretrainedConfig, - PreTrainedModel -) -from transformers.modeling_outputs import CausalLMOutputWithPast - from modules import shared from modules.logging_colors import logger From a3031795a3ef5f77dff968cbf99cf3c558be3fe8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 22 Apr 2025 07:58:19 -0700 Subject: [PATCH 18/21] Update the zip filename --- .github/workflows/build-portable-release-cuda.yml | 3 ++- .github/workflows/build-portable-release.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml index b3cc587a..73be7f0c 100644 --- a/.github/workflows/build-portable-release-cuda.yml +++ b/.github/workflows/build-portable-release-cuda.yml @@ -160,7 +160,8 @@ jobs: # 6. Create ZIP file cd .. - ZIP_NAME="textgen-portable-${VERSION}-${PLATFORM}-cuda${CUDA_VERSION}.zip" + VERSION_CLEAN="${VERSION#v}" + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip" echo "Creating archive: $ZIP_NAME" if [[ "$RUNNER_OS" == "Windows" ]]; then diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml index bb61eb8f..0ea91286 100644 --- a/.github/workflows/build-portable-release.yml +++ b/.github/workflows/build-portable-release.yml @@ -170,7 +170,8 @@ jobs: # 5. Create ZIP file cd .. - ZIP_NAME="textgen-portable-${VERSION}-${PLATFORM}.zip" + VERSION_CLEAN="${VERSION#v}" + ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip" echo "Creating archive: $ZIP_NAME" if [[ "$RUNNER_OS" == "Windows" ]]; then From da1919baaefc6032a23472f0409aa13b90f15fc8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 22 Apr 2025 08:02:08 -0700 Subject: [PATCH 19/21] Update the README --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index a15e974f..f62e3508 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,14 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## How to install +#### Option 1: Portable builds + +Compatible with GGUF (llama.cpp) models, just unzip and run, no installation. Available for Windows, Linux, and macOS. + +Download from: https://github.com/oobabooga/text-generation-webui/releases + +#### Option 2: One-click installer + 1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip). 2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`. 3) Select your GPU vendor when asked. @@ -352,6 +360,10 @@ Run `python download-model.py --help` to see all the options. https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb +## Community + +https://www.reddit.com/r/Oobabooga/ + ## Acknowledgment In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition. From 39cbb5fee038e731c8f0880d2c413fb022896eb4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 22 Apr 2025 08:03:25 -0700 Subject: [PATCH 20/21] Lint --- modules/exllamav2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 92e95ac6..0289bb21 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -3,7 +3,6 @@ import traceback from pathlib import Path import torch - from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -16,6 +15,7 @@ from exllamav2 import ( ExLlamaV2Tokenizer ) from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator + from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length From 25cf3600aa22eb367aee5314ec31291ae637a8df Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 22 Apr 2025 08:04:02 -0700 Subject: [PATCH 21/21] Lint --- modules/exllamav2_hf.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index 5ee0bf60..b159d9ce 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -4,15 +4,6 @@ from pathlib import Path from typing import Any, Dict, Optional, Union import torch -from torch.nn import CrossEntropyLoss -from transformers import ( - GenerationConfig, - GenerationMixin, - PretrainedConfig, - PreTrainedModel -) -from transformers.modeling_outputs import CausalLMOutputWithPast - from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, @@ -23,6 +14,15 @@ from exllamav2 import ( ExLlamaV2Cache_TP, ExLlamaV2Config ) +from torch.nn import CrossEntropyLoss +from transformers import ( + GenerationConfig, + GenerationMixin, + PretrainedConfig, + PreTrainedModel +) +from transformers.modeling_outputs import CausalLMOutputWithPast + from modules import shared from modules.logging_colors import logger