# Main loader controls: GPU layer count, context size, cache type and extra
# server flags. Every initial value is read from shared.args.
shared.gradio['n_gpu_layers'] = gr.Slider(
    label="n-gpu-layers",
    minimum=0,
    maximum=256,
    value=shared.args.n_gpu_layers,
    info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.',
)
shared.gradio['ctx_size'] = gr.Number(
    label='ctx-size',
    precision=0,
    step=256,
    value=shared.args.ctx_size,
    info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.',
)
# allow_custom_value lets the user type entries that are not in `choices`
# (the info text mentions combinations such as q4_q8).
shared.gradio['cache_type'] = gr.Dropdown(
    label="cache_type",
    choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'],
    value=shared.args.cache_type,
    allow_custom_value=True,
    info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).',
)
shared.gradio['extra_flags'] = gr.Textbox(
    label='extra-flags',
    info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"',
    value=shared.args.extra_flags,
)
# Positional-embedding (RoPE) scaling inputs. Initial values are read from
# shared.args; `precision` controls how the number is rounded in the UI.
shared.gradio['alpha_value'] = gr.Number(
    label='alpha_value',
    value=shared.args.alpha_value,
    precision=2,
    info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.',
)
shared.gradio['rope_freq_base'] = gr.Number(
    label='rope_freq_base',
    value=shared.args.rope_freq_base,
    precision=0,
    info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.',
)
shared.gradio['compress_pos_emb'] = gr.Number(
    label='compress_pos_emb',
    value=shared.args.compress_pos_emb,
    precision=2,
    info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.',
)
# Options labelled as used by load-in-4bit, followed by the
# experts-per-token setting for MoE models. Initial values come from
# shared.args.
shared.gradio['compute_dtype'] = gr.Dropdown(
    label="compute_dtype",
    choices=["bfloat16", "float16", "float32"],
    value=shared.args.compute_dtype,
    info='Used by load-in-4bit.',
)
shared.gradio['quant_type'] = gr.Dropdown(
    label="quant_type",
    choices=["nf4", "fp4"],
    value=shared.args.quant_type,
    info='Used by load-in-4bit.',
)
shared.gradio['num_experts_per_token'] = gr.Number(
    label="Number of experts per token",
    value=shared.args.num_experts_per_token,
    info='Only applies to MoE models like Mixtral.',
)
# Boolean loader options, created in list order.
# Each entry is (key, label, info, extra keyword arguments). The key serves
# both as the shared.gradio key and as the name of the shared.args attribute
# that supplies the checkbox's initial value.
checkbox_specs = [
    ('torch_compile', "torch-compile", 'Compile the model with torch.compile for improved performance.', {}),
    ('use_flash_attention_2', "use_flash_attention_2", 'Set use_flash_attention_2=True while loading the model.', {}),
    ('streaming_llm', "streaming_llm", 'Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.', {}),
    ('cpu', "cpu", 'llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.', {}),
    ('row_split', "row_split", 'Split the model by rows across GPUs. This may improve multi-gpu performance.', {}),
    ('no_kv_offload', "no_kv_offload", 'Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.', {}),
    ('numa', "numa", 'NUMA support can help on some systems with non-uniform memory access.', {}),
    ('use_double_quant', "use_double_quant", 'Used by load-in-4bit.', {}),
    ('use_eager_attention', "use_eager_attention", 'Set attn_implementation= eager while loading the model.', {}),
    ('autosplit', "autosplit", 'Automatically split the model tensors across the available GPUs.', {}),
    ('cpp_runner', "cpp-runner", 'Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.', {}),
    # This checkbox is only interactive when the flag was already set on
    # shared.args (see its info text).
    ('trust_remote_code', "trust-remote-code", 'Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', {'interactive': shared.args.trust_remote_code}),
    ('no_use_fast', "no_use_fast", 'Set use_fast=False while loading the tokenizer.', {}),
]
for cb_key, cb_label, cb_info, cb_extra in checkbox_specs:
    shared.gradio[cb_key] = gr.Checkbox(
        label=cb_label,
        value=getattr(shared.args, cb_key),
        info=cb_info,
        **cb_extra,
    )
# Static informational notes rendered as Markdown. They are stored in
# shared.gradio like the other components.
exllamav2_note = "ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders."
shared.gradio['exllamav2_info'] = gr.Markdown(exllamav2_note)

tensorrt_llm_note = '* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.'
shared.gradio['tensorrt_llm_info'] = gr.Markdown(tensorrt_llm_note)
# Speculative-decoding controls: draft model selection and its limits.
# Initial values are read from shared.args.
#
# Fix: `interactive` was previously the fused identifier `notmu`; it is now
# the negation `not mu`.
# NOTE(review): `mu` is not defined in the visible lines; presumably it is a
# boolean set earlier in the enclosing function. Confirm before merging.
shared.gradio['model_draft'] = gr.Dropdown(
    label="model-draft",
    choices=utils.get_available_models(),
    # A callable, so the value is looked up lazily rather than captured here.
    value=lambda: shared.args.model_draft,
    elem_classes='slim-dropdown',
    info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).',
    interactive=not mu,
)
shared.gradio['draft_max'] = gr.Number(
    label="draft-max",
    precision=0,
    step=1,
    value=shared.args.draft_max,
    info='Number of tokens to draft for speculative decoding.',
)
shared.gradio['gpu_layers_draft'] = gr.Slider(
    label="gpu-layers-draft",
    minimum=0,
    maximum=256,
    value=shared.args.gpu_layers_draft,
    info='Number of layers to offload to the GPU for the draft model.',
)
shared.gradio['device_draft'] = gr.Textbox(
    label="device-draft",
    value=shared.args.device_draft,
    info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1',
)
shared.gradio['ctx_size_draft'] = gr.Number(
    label="ctx-size-draft",
    precision=0,
    step=256,
    value=shared.args.ctx_size_draft,
    info='Size of the prompt context for the draft model. If 0, uses the same as the main model.',
)
# Download inputs: a repository path box and an optional single-file box.
#
# Fix: `interactive` was previously the fused identifier `notmu`; it is now
# the negation `not mu`.
# NOTE(review): `mu` is not defined in the visible lines; presumably it is a
# boolean set earlier in the enclosing function. Confirm before merging.
shared.gradio['custom_model_menu'] = gr.Textbox(
    label="Download model or LoRA",
    info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.",
    interactive=not mu,
)
shared.gradio['download_specific_file'] = gr.Textbox(
    placeholder="File name (for GGUF models)",
    show_label=False,
    max_lines=1,
    interactive=not mu,
)
# Instruction-template override selector, followed by its explanatory text.
# The dropdown starts on the literal choice 'None'.
shared.gradio['customized_template'] = gr.Dropdown(
    choices=utils.get_available_instruction_templates(),
    value='None',
    label='Select the desired instruction template',
    elem_classes='slim-dropdown',
)
# Fix: corrected the user-facing typo "medatada" -> "metadata".
gr.Markdown("This allows you to set a customized template for the model currently selected in the \"Model loader\" menu. Whenever the model gets loaded, this template will be used in place of the template specified in the model's metadata, which sometimes is wrong.")
yield"### {}\n\n- Settings updated: Click \"Load\" to load the model\n- Max sequence length: {}".format(selected_model,settings['truncation_length_info'])