Merge branch 'oobabooga:main' into main

SB Yoon 2025-08-08 23:00:15 -06:00 committed by GitHub
commit fe0bef40d2
29 changed files with 237 additions and 85 deletions

View file

@ -43,6 +43,7 @@ class GenerationOptions(BaseModel):
ban_eos_token: bool = False
add_bos_token: bool = True
enable_thinking: bool = True
reasoning_effort: str = "medium"
skip_special_tokens: bool = True
static_cache: bool = False
truncation_length: int = 0
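
The new reasoning_effort generation option is exposed through the OpenAI-compatible API via GenerationOptions. A minimal sketch of passing it in a request, assuming the API extension is running on its default port (5000); the prompt, values, and host are made up for illustration:

import requests

payload = {
    "messages": [{"role": "user", "content": "Summarize the attention mechanism in two sentences."}],
    "max_tokens": 256,
    # New field added in this commit; mirrors the UI dropdown (low/medium/high)
    "reasoning_effort": "high",
}

# Hypothetical local endpoint; adjust host/port to your setup
response = requests.post("http://127.0.0.1:5000/v1/chat/completions", json=payload, timeout=120)
print(response.json()["choices"][0]["message"]["content"])
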

View file

@ -175,7 +175,8 @@ def generate_chat_prompt(user_input, state, **kwargs):
builtin_tools=None,
tools=state['tools'] if 'tools' in state else None,
tools_in_user_message=False,
add_generation_prompt=False
add_generation_prompt=False,
reasoning_effort=state['reasoning_effort']
)
chat_renderer = partial(
@ -210,7 +211,57 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.insert(insert_pos, {"role": "tool", "content": tool_msg})
if assistant_msg:
messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
# Handle GPT-OSS as a special case
if '<|channel|>analysis<|message|>' in assistant_msg or '<|channel|>final<|message|>' in assistant_msg:
    thinking_content = ""
    final_content = ""

    # Extract analysis content if present
    if '<|channel|>analysis<|message|>' in assistant_msg:
        # Split the message by the analysis tag to isolate the content that follows
        parts = assistant_msg.split('<|channel|>analysis<|message|>', 1)
        if len(parts) > 1:
            # The content is everything after the tag
            potential_content = parts[1]

            # Now, find the end of this content block
            analysis_end_tag = '<|end|>'
            if analysis_end_tag in potential_content:
                thinking_content = potential_content.split(analysis_end_tag, 1)[0].strip()
            else:
                # Fallback: if no <|end|> tag, stop at the start of the final channel if it exists
                final_channel_tag = '<|channel|>final<|message|>'
                if final_channel_tag in potential_content:
                    thinking_content = potential_content.split(final_channel_tag, 1)[0].strip()
                else:
                    thinking_content = potential_content.strip()

    # Extract final content if present
    final_tag_to_find = '<|channel|>final<|message|>'
    if final_tag_to_find in assistant_msg:
        # Split the message by the final tag to isolate the content that follows
        parts = assistant_msg.split(final_tag_to_find, 1)
        if len(parts) > 1:
            # The content is everything after the tag
            potential_content = parts[1]

            # Now, find the end of this content block
            final_end_tag = '<|end|>'
            if final_end_tag in potential_content:
                final_content = potential_content.split(final_end_tag, 1)[0].strip()
            else:
                final_content = potential_content.strip()

    # Insert as structured message
    msg_dict = {"role": "assistant", "content": final_content}
    if '<|channel|>analysis<|message|>' in assistant_msg:
        msg_dict["thinking"] = thinking_content

    messages.insert(insert_pos, msg_dict)
else:
    messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
# Check for user message attachments in metadata
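
The block above splits a GPT-OSS reply into its analysis and final channels before re-inserting it as a structured chat message. A standalone sketch of the same splitting, applied to a made-up reply string:

raw = (
    "<|channel|>analysis<|message|>The user wants a one-line answer.<|end|>"
    "<|start|>assistant<|channel|>final<|message|>Paris is the capital of France."
)

thinking_content = ""
if '<|channel|>analysis<|message|>' in raw:
    after_tag = raw.split('<|channel|>analysis<|message|>', 1)[1]
    thinking_content = after_tag.split('<|end|>', 1)[0].strip()

final_content = raw.split('<|channel|>final<|message|>', 1)[1].split('<|end|>', 1)[0].strip()

msg_dict = {"role": "assistant", "content": final_content, "thinking": thinking_content}
# -> {'role': 'assistant',
#     'content': 'Paris is the capital of France.',
#     'thinking': 'The user wants a one-line answer.'}
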
@ -295,18 +346,44 @@ def generate_chat_prompt(user_input, state, **kwargs):
if len(suffix) > 0:
prompt = prompt[:-len(suffix)]
else:
if _continue:
suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
if len(suffix) > 0:
prompt = prompt[:-len(suffix)]
# Handle GPT-OSS as a special case when continuing
if _continue and '<|channel|>final<|message|>' in state['instruction_template_str']:
last_message_to_continue = messages[-1]
prompt = renderer(messages=messages[:-1])
# Start the assistant turn wrapper
assistant_reply_so_far = "<|start|>assistant"
if 'thinking' in last_message_to_continue:
assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>"
assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}"
prompt += assistant_reply_so_far
else:
prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
if state['mode'] == 'chat' and not impersonate:
prefix = apply_extensions('bot_prefix', prefix, state)
prompt = renderer(messages=messages)
if _continue:
suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
if len(suffix) > 0:
prompt = prompt[:-len(suffix)]
else:
prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
prompt += prefix
# Handle GPT-OSS as a special case when not continuing
if '<|channel|>final<|message|>' in state['instruction_template_str']:
if prefix.endswith("<|channel|>final<|message|>"):
prefix = prefix[:-len("<|channel|>final<|message|>")]
if state['mode'] == 'instruct' and not any((_continue, impersonate, state['enable_thinking'])):
if impersonate:
prefix += "<|message|>"
if state['mode'] == 'chat' and not impersonate:
prefix = apply_extensions('bot_prefix', prefix, state)
prompt += prefix
if state['mode'] == 'instruct' and 'enable_thinking' in state['instruction_template_str'] and not any((_continue, impersonate, state['enable_thinking'])):
prompt += get_thinking_suppression_string(instruction_template)
return prompt
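
For context, a small sketch of the continuation wrapper built in the GPT-OSS branch above, using a hypothetical history entry that already holds a thinking block and a partial answer:

last_message_to_continue = {
    "role": "assistant",
    "thinking": "The user asked for a haiku; keep it to three lines.",
    "content": "Autumn wind rises,",
}

assistant_reply_so_far = "<|start|>assistant"
if 'thinking' in last_message_to_continue:
    assistant_reply_so_far += f"<|channel|>analysis<|message|>{last_message_to_continue['thinking']}<|end|>"
assistant_reply_so_far += f"<|channel|>final<|message|>{last_message_to_continue.get('content', '')}"

# Appended to the rendered history, so generation resumes mid-way through the final channel:
# ...<|start|>assistant<|channel|>analysis<|message|>The user asked for a haiku; ...<|end|><|channel|>final<|message|>Autumn wind rises,
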
@ -459,6 +536,12 @@ def get_stopping_strings(state):
result = [item for item in stopping_strings if not any(item.startswith(other) and item != other for other in stopping_strings)]
result = list(set(result))
# Handle GPT-OSS as a special case
if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
result.remove("<|end|>")
result.append("<|result|>")
result = list(set(result))
if shared.args.verbose:
logger.info("STOPPING_STRINGS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(result)
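
The swap above reflects how the GPT-OSS template is structured: <|end|> closes the analysis channel partway through an assistant turn, so stopping on it would truncate the final channel, and <|result|> is appended as a stop string instead. A sketch of the effect on a hypothetical stopping-string list:

result = ["<|end|>", "<|im_end|>"]

if "<|end|>" in result:
    result.remove("<|end|>")
    result.append("<|result|>")
result = list(set(result))

# result now contains '<|im_end|>' and '<|result|>' (order not guaranteed after set())
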
@ -611,9 +694,9 @@ def generate_search_query(user_message, state):
# Use a minimal state for search query generation but keep the full history
search_state = state.copy()
search_state['max_new_tokens'] = 64
search_state['auto_max_new_tokens'] = False
search_state['auto_max_new_tokens'] = True
search_state['enable_thinking'] = False
search_state['reasoning_effort'] = 'low'
search_state['start_with'] = ""
# Generate the full prompt using existing history + augmented message
@ -623,6 +706,12 @@ def generate_search_query(user_message, state):
for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True):
query = reply
# Check for thinking block delimiters and extract content after them
if "</think>" in query:
query = query.rsplit("</think>", 1)[1]
elif "<|start|>assistant<|channel|>final<|message|>" in query:
query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]
# Strip and remove surrounding quotes if present
query = query.strip()
if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
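
The same trimming, applied to two made-up raw replies (one with a <think> block, one in GPT-OSS channel format):

for reply in (
    "<think>Need a concise web query.</think>\nweather Tokyo tomorrow",
    '<|start|>assistant<|channel|>final<|message|>"weather Tokyo tomorrow"',
):
    query = reply
    if "</think>" in query:
        query = query.rsplit("</think>", 1)[1]
    elif "<|start|>assistant<|channel|>final<|message|>" in query:
        query = query.rsplit("<|start|>assistant<|channel|>final<|message|>", 1)[1]

    query = query.strip()
    if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
        query = query[1:-1]

    print(query)  # both iterations print: weather Tokyo tomorrow
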
@ -643,6 +732,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
output = apply_extensions('history', output)
state = apply_extensions('state', state)
# Handle GPT-OSS as a special case
if '<|channel|>final<|message|>' in state['instruction_template_str']:
state['skip_special_tokens'] = False
# Let the jinja2 template handle the BOS token
if state['mode'] in ['instruct', 'chat-instruct']:
state['add_bos_token'] = False
@ -1175,6 +1268,9 @@ def save_last_chat_state(character, mode, unique_id):
def load_history(unique_id, character, mode):
p = get_history_file_path(unique_id, character, mode)
if not p.exists():
return {'internal': [], 'visible': [], 'metadata': {}}
f = json.loads(open(p, 'rb').read())
if 'internal' in f and 'visible' in f:
history = f

View file

@ -463,7 +463,7 @@ class IncrementalGrammarConstraint(GrammarConstraint):
super().__init__(grammar_str, start_rule_name, tokenizer)
def accept_char(self, char, stacks):
byte = ord(char)
byte = char if isinstance(char, int) else ord(char)
new_stacks = []
for stack in stacks:
# stack is empty
@ -549,7 +549,7 @@ class IncrementalGrammarConstraint(GrammarConstraint):
# For each sub-rule in the grammar, cache whether each byte is accepted.
@lru_cache(maxsize=None)
def pos_char_acceptance(self, pos, char):
byte = ord(char)
byte = char if isinstance(char, int) else ord(char)
num_chars = self.grammar_encoding[pos]
pos += 1
for i in range(0, num_chars, 2):
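
The isinstance guard above matters because of how Python 3 iterates strings versus byte strings: iterating a str yields single-character strings, while iterating a bytes object yields ints, so these methods can receive either type depending on the caller. A quick illustration:

text = "ab"
data = text.encode("utf-8")

print([type(c).__name__ for c in text])   # ['str', 'str']
print([type(c).__name__ for c in data])   # ['int', 'int']

def to_byte(char):
    # Same normalization as in the diff
    return char if isinstance(char, int) else ord(char)

assert to_byte(text[0]) == to_byte(data[0]) == 97
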

View file

@ -116,29 +116,60 @@ def extract_thinking_block(string):
THINK_START_TAG = "&lt;think&gt;"
THINK_END_TAG = "&lt;/think&gt;"
# Look for think tag
# Look for think tag first
start_pos = string.find(THINK_START_TAG)
end_pos = string.find(THINK_END_TAG)
# Return if neither tag is in string
if start_pos == -1 and end_pos == -1:
return None, string
# If think tags found, use existing logic
if start_pos != -1 or end_pos != -1:
# handle missing start or end tags
if start_pos == -1:
thought_start = 0
else:
thought_start = start_pos + len(THINK_START_TAG)
if end_pos == -1:
thought_end = len(string)
content_start = len(string)
else:
thought_end = end_pos
content_start = end_pos + len(THINK_END_TAG)
thinking_content = string[thought_start:thought_end]
remaining_content = string[content_start:]
return thinking_content, remaining_content
# handle missing start or end tags
if start_pos == -1:
thought_start = 0
else:
thought_start = start_pos + len(THINK_START_TAG)
if end_pos == -1:
thought_end = len(string)
content_start = len(string)
else:
thought_end = end_pos
content_start = end_pos + len(THINK_END_TAG)
# If think tags not found, try alternative format
ALT_START = "&lt;|channel|&gt;analysis&lt;|message|&gt;"
ALT_END = "&lt;|end|&gt;"
ALT_CONTENT_START = "&lt;|start|&gt;assistant&lt;|channel|&gt;final&lt;|message|&gt;"
thinking_content = string[thought_start:thought_end]
remaining_content = string[content_start:]
return thinking_content, remaining_content
alt_start_pos = string.find(ALT_START)
alt_end_pos = string.find(ALT_END)
alt_content_pos = string.find(ALT_CONTENT_START)
if alt_start_pos != -1 or alt_end_pos != -1:
if alt_start_pos == -1:
thought_start = 0
else:
thought_start = alt_start_pos + len(ALT_START)
# If no explicit end tag but content start exists, use content start as end
if alt_end_pos == -1:
if alt_content_pos != -1:
thought_end = alt_content_pos
content_start = alt_content_pos + len(ALT_CONTENT_START)
else:
thought_end = len(string)
content_start = len(string)
else:
thought_end = alt_end_pos
content_start = alt_content_pos + len(ALT_CONTENT_START) if alt_content_pos != -1 else alt_end_pos + len(ALT_END)
thinking_content = string[thought_start:thought_end]
remaining_content = string[content_start:]
return thinking_content, remaining_content
# Return if neither format is found
return None, string
@functools.lru_cache(maxsize=None)
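
Illustrative inputs for the two formats handled above; the tag constants are HTML-escaped, which suggests the string has already been escaped upstream in the HTML pipeline (an assumption based on the constants, not stated in the diff):

qwen_style = "&lt;think&gt;Considering options.&lt;/think&gt;The answer is 42."
gptoss_style = (
    "&lt;|channel|&gt;analysis&lt;|message|&gt;Considering options.&lt;|end|&gt;"
    "&lt;|start|&gt;assistant&lt;|channel|&gt;final&lt;|message|&gt;The answer is 42."
)

# Expected result for both inputs (sketch):
#   extract_thinking_block(...) -> ("Considering options.", "The answer is 42.")
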

View file

@ -140,6 +140,7 @@ def transformers_samplers():
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'skip_special_tokens',
'static_cache',
'seed',
@ -192,6 +193,7 @@ loaders_samplers = {
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'skip_special_tokens',
'seed',
'sampler_priority',
@ -239,6 +241,7 @@ loaders_samplers = {
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'skip_special_tokens',
'seed',
'sampler_priority',
@ -278,6 +281,7 @@ loaders_samplers = {
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'skip_special_tokens',
'seed',
'custom_token_bans',
@ -311,6 +315,7 @@ loaders_samplers = {
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'seed',
'sampler_priority',
'dry_sequence_breakers',

View file

@ -90,8 +90,10 @@ def get_model_metadata(model):
template = template.replace('eos_token', "'{}'".format(eos_token))
template = template.replace('bos_token', "'{}'".format(bos_token))
template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
template = re.sub(r'raise_exception\([^)]*\)', "''", template)
template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS
model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
model_settings['instruction_template_str'] = template
@ -122,13 +124,25 @@ def get_model_metadata(model):
# Try to find the Jinja instruct template
path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
template = None
# 1. Prioritize reading from chat_template.jinja if it exists
jinja_path = Path(f'{shared.args.model_dir}/{model}') / 'chat_template.jinja'
if jinja_path.exists():
with open(jinja_path, 'r', encoding='utf-8') as f:
template = f.read()
if path.exists():
metadata = json.loads(open(path, 'r', encoding='utf-8').read())
if 'chat_template' in metadata:
# 2. Only read from metadata if we haven't already loaded from .jinja
if template is None and 'chat_template' in metadata:
template = metadata['chat_template']
if isinstance(template, list):
template = template[0]['template']
# 3. If a template was found from either source, process it
if template:
for k in ['eos_token', 'bos_token']:
if k in metadata:
value = metadata[k]
@ -137,8 +151,10 @@ def get_model_metadata(model):
template = template.replace(k, "'{}'".format(value))
template = re.sub(r"\{\{-?\s*raise_exception\(.*?\)\s*-?\}\}", "", template, flags=re.DOTALL)
template = re.sub(r'raise_exception\([^)]*\)', "''", template)
template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
template = re.sub(r'elif loop\.last and not add_generation_prompt', 'elif False', template) # Handle GPT-OSS
model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
model_settings['instruction_template_str'] = template
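
Condensed sketch of the template-loading priority introduced above, with error handling omitted and the paths treated as hypothetical:

import json
from pathlib import Path

def load_chat_template(model_dir: Path):
    template = None

    # 1. Prefer a standalone chat_template.jinja shipped with the model
    jinja_path = model_dir / 'chat_template.jinja'
    if jinja_path.exists():
        template = jinja_path.read_text(encoding='utf-8')

    # 2. Otherwise fall back to the "chat_template" field of tokenizer_config.json,
    #    which may be a plain string or a list of named templates
    config_path = model_dir / 'tokenizer_config.json'
    if config_path.exists():
        metadata = json.loads(config_path.read_text(encoding='utf-8'))
        if template is None and 'chat_template' in metadata:
            template = metadata['chat_template']
            if isinstance(template, list):
                template = template[0]['template']

    return template
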

View file

@ -211,6 +211,7 @@ settings = {
'ban_eos_token': False,
'add_bos_token': True,
'enable_thinking': True,
'reasoning_effort': 'medium',
'skip_special_tokens': True,
'stream': True,
'static_cache': False,

View file

@ -136,7 +136,6 @@ def load_model_HF(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
'attn_implementation': shared.args.attn_implementation,
}

View file

@ -215,6 +215,7 @@ def list_interface_input_elements():
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'skip_special_tokens',
'stream',
'static_cache',
@ -482,6 +483,7 @@ def setup_auto_save():
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'reasoning_effort',
'skip_special_tokens',
'stream',
'static_cache',

View file

@ -78,7 +78,8 @@ def create_ui():
with gr.Row():
shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by pre-2507 Qwen3.')
shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search')
with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)

View file

@ -23,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb
@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -22,7 +22,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb
@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -22,7 +22,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb
@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -22,7 +22,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb
@ -33,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl

View file

@ -23,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb
@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2-py3-none-any.whl

View file

@ -22,7 +22,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb
@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -22,7 +22,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb
@ -33,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -23,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.3.1.post19; platform_system == "Windows"
tqdm
wandb
@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -23,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.3.1.post19; platform_system == "Windows"
tqdm
wandb
@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.7.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -23,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb
@ -34,8 +34,8 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.5/exllamav3-0.0.5+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -22,7 +22,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.53.*
transformers==4.55.*
triton-windows==3.2.0.post19; platform_system == "Windows"
tqdm
wandb

View file

@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@ -19,6 +19,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

View file

@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -18,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.31.0/llama_cpp_binaries-0.31.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.33.0/llama_cpp_binaries-0.33.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"