From d03923924a853ffe3662f498fd4f4e5aedab8fb3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 6 Mar 2026 16:52:02 -0300
Subject: [PATCH] Several small fixes

- Stop llama-server subprocess on model unload instead of relying on GC
- Fix tool_calls[].index being string instead of int in API responses
- Omit tool_calls key from API response when empty per OpenAI spec
- Prevent division by zero when micro_batch_size > batch_size in training
- Copy sampler_priority list before mutating in ExLlamaV3
- Normalize presence/frequency_penalty names for ExLlamaV3 sampler sorting
- Restore original chat_template after training instead of leaving it mutated
---
 extensions/openai/completions.py | 4 ++--
 modules/exllamav3.py             | 7 ++++++-
 modules/models.py                | 2 ++
 modules/training.py              | 7 ++++++-
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index cabfce99..8ba031c1 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -343,7 +343,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             if len(tool_call) > 0:
                 for tc in tool_call:
                     tc["id"] = getToolCallId()
-                    tc["index"] = str(len(tool_calls))
+                    tc["index"] = len(tool_calls)
                     tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
                     tool_calls.append(tc)
                 end_last_tool_call = len(answer)
@@ -391,7 +391,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             resp_list: [{
                 "index": 0,
                 "finish_reason": stop_reason,
-                "message": {"role": "assistant", "content": answer, "tool_calls": tool_calls},
+                "message": {"role": "assistant", "content": answer, **({"tool_calls": tool_calls} if tool_calls else {})},
             }],
             "usage": {
                 "prompt_tokens": token_count,
diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index af5745bc..df37ddbe 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -339,11 +339,16 @@ class Exllamav3Model:
 
         # 3. Get the priority list and handle temperature_last
         default_priority = ['repetition_penalty', 'presence_frequency_penalty', 'top_k', 'top_p', 'min_p', 'temperature']
-        sampler_priority = state.get('sampler_priority') or default_priority
+        sampler_priority = list(state.get('sampler_priority') or default_priority)
 
         if state['temperature_last'] and 'temperature' in sampler_priority:
             sampler_priority.append(sampler_priority.pop(sampler_priority.index('temperature')))
 
+        # The preset system uses separate 'presence_penalty' and
+        # 'frequency_penalty', but ExLlamaV3 has a single combined
+        # SS_PresFreqP sampler. Normalize to the combined name.
+        sampler_priority = ['presence_frequency_penalty' if x in ('presence_penalty', 'frequency_penalty') else x for x in sampler_priority]
+
         # 4. Sort the unordered list based on the priority list
         def custom_sort_key(sampler_obj):
             class_name = sampler_obj.__class__.__name__
diff --git a/modules/models.py b/modules/models.py
index bc5585cf..48d68b0b 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -126,6 +126,8 @@ def unload_model(keep_model_name=False):
 
     if model_class_name in ['Exllamav3Model', 'Exllamav3HF', 'TensorRTLLMModel']:
         shared.model.unload()
+    elif model_class_name == 'LlamaServer':
+        shared.model.stop()
 
     shared.model = shared.tokenizer = None
     shared.lora_names = []
diff --git a/modules/training.py b/modules/training.py
index 87539461..2e172d22 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -333,7 +333,8 @@ def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en:
         yield "Cannot input zeroes."
         return
 
-    gradient_accumulation_steps = batch_size // micro_batch_size
+    gradient_accumulation_steps = max(1, batch_size // micro_batch_size)
+    original_chat_template = getattr(shared.tokenizer, 'chat_template', None)
     if shared.tokenizer.pad_token_id is None:
         shared.tokenizer.pad_token_id = shared.tokenizer.eos_token_id
     shared.tokenizer.padding_side = "right"
@@ -820,6 +821,10 @@ def do_train(lora_name: str, always_override: bool, all_linear: bool, q_proj_en:
         logger.info("Training complete, saving")
         lora_model.save_pretrained(lora_file_path)
 
+    # Restore the original chat_template if we changed it for training
+    if shared.tokenizer is not None and hasattr(shared.tokenizer, 'chat_template'):
+        shared.tokenizer.chat_template = original_chat_template
+
     if WANT_INTERRUPT:
         logger.info("Training interrupted.")
         yield f"Interrupted. Incomplete LoRA saved to `{lora_file_path}`."