diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py
index 6643ed16..6bd3749f 100644
--- a/extensions/openai/typing.py
+++ b/extensions/openai/typing.py
@@ -43,6 +43,7 @@ class GenerationOptions(BaseModel):
     ban_eos_token: bool = False
     add_bos_token: bool = True
     enable_thinking: bool = True
+    reasoning_effort: str = "medium"
     skip_special_tokens: bool = True
    static_cache: bool = False
     truncation_length: int = 0
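Since `GenerationOptions` is the request schema for the OpenAI-compatible endpoints, the new field can be sent directly in a completion request. A minimal client sketch, assuming the default local server address and an arbitrary prompt; only the `reasoning_effort` field name comes from this diff:

```python
# Hypothetical client call: everything except the "reasoning_effort" field
# (URL, port, prompt) is an illustrative assumption.
import json
import urllib.request

payload = {
    "messages": [{"role": "user", "content": "Explain RAID 5 briefly."}],
    "reasoning_effort": "high",  # one of "low" | "medium" | "high"
}
req = urllib.request.Request(
    "http://127.0.0.1:5000/v1/chat/completions",  # assumed default server address
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["choices"][0]["message"]["content"])
```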
diff --git a/modules/chat.py b/modules/chat.py
index 827b6050..1ab91b5e 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -175,7 +175,8 @@ def generate_chat_prompt(user_input, state, **kwargs):
         builtin_tools=None,
         tools=state['tools'] if 'tools' in state else None,
         tools_in_user_message=False,
-        add_generation_prompt=False
+        add_generation_prompt=False,
+        reasoning_effort=state['reasoning_effort']
     )
 
     chat_renderer = partial(
@@ -210,7 +211,57 @@ def generate_chat_prompt(user_input, state, **kwargs):
                 messages.insert(insert_pos, {"role": "tool", "content": tool_msg})
 
             if assistant_msg:
-                messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
+                # Handle GPT-OSS as a special case
+                if '<|channel|>analysis<|message|>' in assistant_msg or '<|channel|>final<|message|>' in assistant_msg:
+
+                    thinking_content = ""
+                    final_content = ""
+
+                    # Extract analysis content if present
+                    if '<|channel|>analysis<|message|>' in assistant_msg:
+                        # Split the message by the analysis tag to isolate the content that follows
+                        parts = assistant_msg.split('<|channel|>analysis<|message|>', 1)
+                        if len(parts) > 1:
+                            # The content is everything after the tag
+                            potential_content = parts[1]
+
+                            # Now, find the end of this content block
+                            analysis_end_tag = '<|end|>'
+                            if analysis_end_tag in potential_content:
+                                thinking_content = potential_content.split(analysis_end_tag, 1)[0].strip()
+                            else:
+                                # Fallback: if no <|end|> tag, stop at the start of the final channel if it exists
+                                final_channel_tag = '<|channel|>final<|message|>'
+                                if final_channel_tag in potential_content:
+                                    thinking_content = potential_content.split(final_channel_tag, 1)[0].strip()
+                                else:
+                                    thinking_content = potential_content.strip()
+
+                    # Extract final content if present
+                    final_tag_to_find = '<|channel|>final<|message|>'
+                    if final_tag_to_find in assistant_msg:
+                        # Split the message by the final tag to isolate the content that follows
+                        parts = assistant_msg.split(final_tag_to_find, 1)
+                        if len(parts) > 1:
+                            # The content is everything after the tag
+                            potential_content = parts[1]
+
+                            # Now, find the end of this content block
+                            final_end_tag = '<|end|>'
+                            if final_end_tag in potential_content:
+                                final_content = potential_content.split(final_end_tag, 1)[0].strip()
+                            else:
+                                final_content = potential_content.strip()
+
+                    # Insert as structured message
+                    msg_dict = {"role": "assistant", "content": final_content}
+                    if '<|channel|>analysis<|message|>' in assistant_msg:
+                        msg_dict["thinking"] = thinking_content
+
+                    messages.insert(insert_pos, msg_dict)
+
+                else:
+                    messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
 
         if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
             # Check for user message attachments in metadata
@@ -295,18 +346,44 @@ def generate_chat_prompt(user_input, state, **kwargs):
             if len(suffix) > 0:
                 prompt = prompt[:-len(suffix)]
     else:
-        if _continue:
-            suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
-            if len(suffix) > 0:
-                prompt = prompt[:-len(suffix)]
+        # Handle GPT-OSS as a special case when continuing
+        if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']:
+            last_message_to_continue = messages[-1]
+            prompt = renderer(messages=messages[:-1])
+
+            # Start the assistant turn wrapper
+            assistant_reply_so_far = "<|start|>assistant"
+
+            if 'thinking' in last_message_to_continue:
+                assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>"
+
+            assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}"
+
+            prompt += assistant_reply_so_far
+
         else:
-            prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
-            if state['mode'] == 'chat' and not impersonate:
-                prefix = apply_extensions('bot_prefix', prefix, state)
+            prompt = renderer(messages=messages)
+            if _continue:
+                suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
+                if len(suffix) > 0:
+                    prompt = prompt[:-len(suffix)]
+            else:
+                prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
 
-            prompt += prefix
+                # Handle GPT-OSS as a special case when not continuing
+                if '<|channel|>final<|message|>' in state['instruction_template_str']:
+                    if prefix.endswith("<|channel|>final<|message|>"):
+                        prefix = prefix[:-len("<|channel|>final<|message|>")]
 
-    if state['mode'] == 'instruct' and not any((_continue, impersonate, state['enable_thinking'])):
+                    if impersonate:
+                        prefix += "<|message|>"
+
+                if state['mode'] == 'chat' and not impersonate:
+                    prefix = apply_extensions('bot_prefix', prefix, state)
+
+                prompt += prefix
+
+    if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])):
         prompt += get_thinking_suppression_string(instruction_template)
 
     return prompt
@@ -459,6 +536,12 @@ def get_stopping_strings(state):
         result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)]
         result = list(set(result))
 
+    # Handle GPT-OSS as a special case
+    if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
+        result.remove("<|end|>")
+        result.append("<|return|>")
+        result = list(set(result))
+
     if shared.args.verbose:
         logger.info("STOPPING_STRINGS=")
         pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result)
@@ -611,9 +694,9 @@ def generate_search_query(user_message, state):
 
     # Use a minimal state for search query generation but keep the full history
     search_state = state.copy()
-    search_state['max_new_tokens'] = 64
-    search_state['auto_max_new_tokens'] = False
+    search_state['auto_max_new_tokens'] = True
     search_state['enable_thinking'] = False
+    search_state['reasoning_effort'] = 'low'
    search_state['start_with'] = ""
 
     # Generate the full prompt using existing history + augmented message
@@ -623,6 +706,12 @@ def generate_search_query(user_message, state):
     for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True):
         query = reply
 
+    # Check for thinking block delimiters and extract content after them
+    if "</think>" in query:
+        query = query.rsplit("</think>", 1)[1]
+    elif "<|start|>assistant<|channel|>final<|message|>" in query:
+        query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
+
     # Strip and remove surrounding quotes if present
     query = query.strip()
     if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
@@ -643,6 +732,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     output = apply_extensions('history', output)
     state = apply_extensions('state', state)
 
+    # Handle GPT-OSS as a special case
+    if '<|channel|>final<|message|>' in state['instruction_template_str']:
+        state['skip_special_tokens'] = False
+
     # Let the jinja2 template handle the BOS token
     if state['mode'] in ['instruct', 'chat-instruct']:
         state['add_bos_token'] = False
@@ -1175,6 +1268,9 @@ def save_last_chat_state(character, mode, unique_id):
 
 def load_history(unique_id, character, mode):
     p = get_history_file_path(unique_id, character, mode)
+    if not p.exists():
+        return {'internal': [], 'visible': [], 'metadata': {}}
+
     f = json.loads(open(p, 'rb').read())
     if 'internal' in f and 'visible' in f:
         history = f
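The analysis/final splitting added above is easier to follow outside the diff. A condensed, standalone sketch of the same idea (equivalent in spirit, not the PR code):

```python
# Condensed sketch of the GPT-OSS channel parsing above: split an assistant
# message into (thinking, final) parts.
ANALYSIS = '<|channel|>analysis<|message|>'
FINAL = '<|channel|>final<|message|>'
END = '<|end|>'

def split_gpt_oss(assistant_msg):
    thinking, final = "", ""
    if ANALYSIS in assistant_msg:
        after = assistant_msg.split(ANALYSIS, 1)[1]
        # Analysis ends at <|end|>, else at the final channel, else at end of string
        for stop in (END, FINAL):
            if stop in after:
                after = after.split(stop, 1)[0]
                break
        thinking = after.strip()
    if FINAL in assistant_msg:
        after = assistant_msg.split(FINAL, 1)[1]
        final = after.split(END, 1)[0].strip() if END in after else after.strip()
    return thinking, final

# split_gpt_oss("<|channel|>analysis<|message|>plan<|end|>"
#               "<|start|>assistant<|channel|>final<|message|>Hi!")
# -> ("plan", "Hi!")
```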
diff --git a/modules/grammar/grammar_utils.py b/modules/grammar/grammar_utils.py
index 7f09ff82..af78f6b9 100644
--- a/modules/grammar/grammar_utils.py
+++ b/modules/grammar/grammar_utils.py
@@ -463,7 +463,7 @@ class IncrementalGrammarConstraint(GrammarConstraint):
         super().__init__(grammar_str, start_rule_name, tokenizer)
 
     def accept_char(self, char, stacks):
-        byte = ord(char)
+        byte = char if isinstance(char, int) else ord(char)
         new_stacks = []
         for stack in stacks:
             # stack is empty
@@ -549,7 +549,7 @@ class IncrementalGrammarConstraint(GrammarConstraint):
     # For each sub-rule in the grammar, cache whether each byte is accepted.
     @lru_cache(maxsize=None)
     def pos_char_acceptance(self, pos, char):
-        byte = ord(char)
+        byte = char if isinstance(char, int) else ord(char)
         num_chars = self.grammar_encoding[pos]
         pos += 1
         for i in range(0, num_chars, 2):
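The `isinstance` guard is needed because iterating a `bytes` object yields `int` values, on which `ord()` raises `TypeError`. A quick illustration:

```python
# Iterating a str yields 1-char strings, but iterating bytes yields ints.
for char in "ab":
    print(ord(char))        # 97, 98

for char in b"ab":
    print(type(char))       # <class 'int'> -- ord(char) would raise TypeError here
    byte = char if isinstance(char, int) else ord(char)
    print(byte)             # 97, 98
```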
diff --git a/modules/html_generator.py b/modules/html_generator.py
index 6844c244..79237f7f 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -116,29 +116,60 @@ def extract_thinking_block(string):
     THINK_START_TAG = "<think>"
     THINK_END_TAG = "</think>"
 
-    # Look for think tag
+    # Look for think tag first
     start_pos = string.find(THINK_START_TAG)
     end_pos = string.find(THINK_END_TAG)
 
-    # Return if neither tag is in string
-    if start_pos == -1 and end_pos == -1:
-        return None, string
+    # If think tags found, use existing logic
+    if start_pos != -1 or end_pos != -1:
+        # handle missing start or end tags
+        if start_pos == -1:
+            thought_start = 0
+        else:
+            thought_start = start_pos + len(THINK_START_TAG)
+        if end_pos == -1:
+            thought_end = len(string)
+            content_start = len(string)
+        else:
+            thought_end = end_pos
+            content_start = end_pos + len(THINK_END_TAG)
+        thinking_content = string[thought_start:thought_end]
+        remaining_content = string[content_start:]
+        return thinking_content, remaining_content
 
-    # handle missing start or end tags
-    if start_pos == -1:
-        thought_start = 0
-    else:
-        thought_start = start_pos + len(THINK_START_TAG)
-    if end_pos == -1:
-        thought_end = len(string)
-        content_start = len(string)
-    else:
-        thought_end = end_pos
-        content_start = end_pos + len(THINK_END_TAG)
+    # If think tags not found, try alternative format
+    ALT_START = "<|channel|>analysis<|message|>"
+    ALT_END = "<|end|>"
+    ALT_CONTENT_START = "<|start|>assistant<|channel|>final<|message|>"
 
-    thinking_content = string[thought_start:thought_end]
-    remaining_content = string[content_start:]
-    return thinking_content, remaining_content
+    alt_start_pos = string.find(ALT_START)
+    alt_end_pos = string.find(ALT_END)
+    alt_content_pos = string.find(ALT_CONTENT_START)
+
+    if alt_start_pos != -1 or alt_end_pos != -1:
+        if alt_start_pos == -1:
+            thought_start = 0
+        else:
+            thought_start = alt_start_pos + len(ALT_START)
+
+        # If no explicit end tag but content start exists, use content start as end
+        if alt_end_pos == -1:
+            if alt_content_pos != -1:
+                thought_end = alt_content_pos
+                content_start = alt_content_pos + len(ALT_CONTENT_START)
+            else:
+                thought_end = len(string)
+                content_start = len(string)
+        else:
+            thought_end = alt_end_pos
+            content_start = alt_content_pos + len(ALT_CONTENT_START) if alt_content_pos != -1 else alt_end_pos + len(ALT_END)
+
+        thinking_content = string[thought_start:thought_end]
+        remaining_content = string[content_start:]
+        return thinking_content, remaining_content
+
+    # Return if neither format is found
+    return None, string
 
 
 @functools.lru_cache(maxsize=None)
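For reference, the expected behavior of `extract_thinking_block` after this change, for both supported formats (illustrative calls; return values follow the logic above):

```python
extract_thinking_block("<think>plan</think>answer")
# -> ("plan", "answer")

extract_thinking_block(
    "<|channel|>analysis<|message|>plan<|end|>"
    "<|start|>assistant<|channel|>final<|message|>answer"
)
# -> ("plan", "answer")

extract_thinking_block("plain text")
# -> (None, "plain text")
```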
diff --git a/modules/loaders.py b/modules/loaders.py
index c9c27bd5..251cb4e1 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -140,6 +140,7 @@ def transformers_samplers():
         'ban_eos_token',
         'add_bos_token',
         'enable_thinking',
+        'reasoning_effort',
         'skip_special_tokens',
         'static_cache',
         'seed',
@@ -192,6 +193,7 @@ loaders_samplers = {
         'ban_eos_token',
         'add_bos_token',
         'enable_thinking',
+        'reasoning_effort',
         'skip_special_tokens',
         'seed',
         'sampler_priority',
@@ -239,6 +241,7 @@ loaders_samplers = {
         'ban_eos_token',
         'add_bos_token',
         'enable_thinking',
+        'reasoning_effort',
         'skip_special_tokens',
         'seed',
         'sampler_priority',
@@ -278,6 +281,7 @@ loaders_samplers = {
         'ban_eos_token',
         'add_bos_token',
         'enable_thinking',
+        'reasoning_effort',
         'skip_special_tokens',
         'seed',
         'custom_token_bans',
@@ -311,6 +315,7 @@ loaders_samplers = {
         'ban_eos_token',
         'add_bos_token',
         'enable_thinking',
+        'reasoning_effort',
         'seed',
         'sampler_priority',
         'dry_sequence_breakers',
diff --git a/modules/models_settings.py b/modules/models_settings.py
index f5b3add9..f336e45b 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -90,8 +90,10 @@ def get_model_metadata(model):
             template = template.replace('eos_token', "'{}'".format(eos_token))
             template = template.replace('bos_token', "'{}'".format(bos_token))
 
+            template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
             template = re.sub(r'raise_exception\([^)]*\)', "''", template)
             template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
+            template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template)  # Handle GPT-OSS
             model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
             model_settings['instruction_template_str'] = template
@@ -122,13 +124,25 @@ def get_model_metadata(model):
 
         # Try to find the Jinja instruct template
         path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
+        template = None
+
+        # 1. Prioritize reading from chat_template.jinja if it exists
+        jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja'
+        if jinja_path.exists():
+            with open(jinja_path, 'r', encoding='utf-8') as f:
+                template = f.read()
+
         if path.exists():
             metadata = json.loads(open(path, 'r', encoding='utf-8').read())
-            if 'chat_template' in metadata:
+
+            # 2. Only read from metadata if we haven't already loaded from .jinja
+            if template is None and 'chat_template' in metadata:
                 template = metadata['chat_template']
                 if isinstance(template, list):
                     template = template[0]['template']
 
+        # 3. If a template was found from either source, process it
+        if template:
             for k in ['eos_token', 'bos_token']:
                 if k in metadata:
                     value = metadata[k]
@@ -137,8 +151,10 @@ def get_model_metadata(model):
                     template = template.replace(k, "'{}'".format(value))
 
+            template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
             template = re.sub(r'raise_exception\([^)]*\)', "''", template)
             template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
+            template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template)  # Handle GPT-OSS
             model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
             model_settings['instruction_template_str'] = template
diff --git a/modules/shared.py b/modules/shared.py
index 5e3e11c0..ab5198d1 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -211,6 +211,7 @@ settings = {
     'ban_eos_token': False,
     'add_bos_token': True,
     'enable_thinking': True,
+    'reasoning_effort': 'medium',
     'skip_special_tokens': True,
     'stream': True,
     'static_cache': False,
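The template sanitization in modules/models_settings.py above neutralizes `{{ raise_exception(...) }}` statements (which GPT-OSS's template uses for unsupported inputs) and disables its `elif loop.last and not add_generation_prompt` branch. A small self-contained illustration on a made-up template fragment:

```python
# What the sanitization passes do, on an illustrative (not GPT-OSS) fragment.
import re

template = (
    "{{- raise_exception('System role not supported') }}"
    "{% elif loop.last and not add_generation_prompt %}x{% endif %}"
)
template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
template = re.sub(r'raise_exception\([^)]*\)', "''", template)
template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template)
print(template)
# {% elif False %}x{% endif %}
```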
diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index 2f7367a4..e4072125 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -136,7 +136,6 @@ def load_model_HF(model_name):
     path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
     params = {
         'low_cpu_mem_usage': True,
-        'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
         'attn_implementation': shared.args.attn_implementation,
     }
 
diff --git a/modules/ui.py b/modules/ui.py
index 98acc038..e7805046 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -215,6 +215,7 @@ def list_interface_input_elements():
         'ban_eos_token',
         'add_bos_token',
         'enable_thinking',
+        'reasoning_effort',
         'skip_special_tokens',
         'stream',
         'static_cache',
@@ -482,6 +483,7 @@ def setup_auto_save():
         'ban_eos_token',
         'add_bos_token',
         'enable_thinking',
+        'reasoning_effort',
         'skip_special_tokens',
         'stream',
         'static_cache',
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 4dade176..1d85a398 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -78,7 +78,8 @@ def create_ui():
             with gr.Row():
                 shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
 
-            shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle <think> mode.')
+            shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
+            shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.')
             shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search')
 
             with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
                 shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)
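The dropdown value ends up as `state['reasoning_effort']` and reaches the Jinja chat template through the `reasoning_effort` kwarg that modules/chat.py now passes to the renderer. A minimal sketch of how a template can consume that kwarg; the template text here is illustrative, loosely modeled on GPT-OSS's "Reasoning: <level>" system-prompt line, not the actual GPT-OSS template:

```python
# Hypothetical template showing the kwarg plumbing; only the kwarg name
# "reasoning_effort" comes from this diff.
from jinja2 import Template

tmpl = Template("Reasoning: {{ reasoning_effort }}\n{{ messages[0]['content'] }}")
print(tmpl.render(messages=[{"role": "user", "content": "hi"}], reasoning_effort="low"))
# Reasoning: low
# hi
```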
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 687f1f5a..f17cae8a 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 8224d987..51f4571f 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index 22141a8a..37021c77 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 04325464..f54ae191 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -33,7 +33,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index bba5804e..7d7925b7 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 8a84e403..72847534 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 9488f5e7..ed641a24 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -33,5 +33,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt
index a2af5108..d7fe735b 100644
--- a/requirements/full/requirements_cuda128.txt
+++ b/requirements/full/requirements_cuda128.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.3.1.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt
index 948a275a..cb71f74b 100644
--- a/requirements/full/requirements_cuda128_noavx2.txt
+++ b/requirements/full/requirements_cuda128_noavx2.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.3.1.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index 8f7106e4..d6bed576 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
@@ -34,8 +34,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 69a82184..cd85a744 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.53.*
+transformers==4.55.*
 triton-windows==3.2.0.post19; platform_system == "Windows"
 tqdm
 wandb
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 53479a80..1f17dc50 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index d7336d2f..82254842 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 2c2296f2..fdf1632b 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -19,6 +19,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 04c9b283..833e923b 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 3c3563d3..6a894d49 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index cf0d7b11..0afb19c2 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 9bd8a37c..a404f50c 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index b8519553..75176656 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"