From f0c16813ef11a8ed39db3586614c06239ef25807 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 14 Mar 2026 19:35:12 -0700
Subject: [PATCH] Remove the rope scaling parameters

Models now typically ship with 131k+ context length, so manual RoPE
scaling is rarely needed. For llama.cpp, the parameters can still be
passed through --extra-flags (e.g. "rope-freq-base=1000000").
---
 README.md                      | 26 ++++++++++----------------
 docs/04 - Model Tab.md         |  3 ---
 modules/llama_cpp_server.py    |  4 ----
 modules/loaders.py             |  7 -------
 modules/models_settings.py     | 22 ----------------------
 modules/shared.py              |  6 ------
 modules/transformers_loader.py |  7 -------
 modules/ui_model_menu.py       |  3 ---
 8 files changed, 10 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md
index 9a8e0a86..f1527176 100644
--- a/README.md
+++ b/README.md
@@ -244,15 +244,14 @@ usage: server.py [-h] [--user-data-dir USER_DATA_DIR] [--multi-user] [--model MO
                  [--row-split] [--no-mmap] [--mlock] [--no-kv-offload] [--batch-size BATCH_SIZE] [--ubatch-size UBATCH_SIZE] [--threads THREADS] [--threads-batch THREADS_BATCH] [--numa]
                  [--parallel PARALLEL] [--fit-target FIT_TARGET] [--extra-flags EXTRA_FLAGS] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16]
                  [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--attn-implementation IMPLEMENTATION] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE]
-                 [--quant_type QUANT_TYPE] [--gpu-split GPU_SPLIT] [--enable-tp] [--tp-backend TP_BACKEND] [--cfg-cache] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
-                 [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
-                 [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api]
-                 [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] [--temperature N]
-                 [--dynatemp-low N] [--dynatemp-high N] [--dynatemp-exponent N] [--smoothing-factor N] [--smoothing-curve N] [--min-p N] [--top-p N] [--top-k N] [--typical-p N] [--xtc-threshold N]
-                 [--xtc-probability N] [--epsilon-cutoff N] [--eta-cutoff N] [--tfs N] [--top-a N] [--top-n-sigma N] [--adaptive-target N] [--adaptive-decay N] [--dry-multiplier N]
-                 [--dry-allowed-length N] [--dry-base N] [--repetition-penalty N] [--frequency-penalty N] [--presence-penalty N] [--encoder-repetition-penalty N] [--no-repeat-ngram-size N]
-                 [--repetition-penalty-range N] [--penalty-alpha N] [--guidance-scale N] [--mirostat-mode N] [--mirostat-tau N] [--mirostat-eta N] [--do-sample | --no-do-sample]
-                 [--dynamic-temperature | --no-dynamic-temperature] [--temperature-last | --no-temperature-last] [--sampler-priority N] [--dry-sequence-breakers N]
+                 [--quant_type QUANT_TYPE] [--gpu-split GPU_SPLIT] [--enable-tp] [--tp-backend TP_BACKEND] [--cfg-cache] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share]
+                 [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors]
+                 [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4]
+                 [--nowebui] [--temperature N] [--dynatemp-low N] [--dynatemp-high N] [--dynatemp-exponent N] [--smoothing-factor N] [--smoothing-curve N] [--min-p N] [--top-p N] [--top-k N]
+                 [--typical-p N] [--xtc-threshold N] [--xtc-probability N] [--epsilon-cutoff N] [--eta-cutoff N] [--tfs N] [--top-a N] [--top-n-sigma N] [--adaptive-target N] [--adaptive-decay N]
+                 [--dry-multiplier N] [--dry-allowed-length N] [--dry-base N] [--repetition-penalty N] [--frequency-penalty N] [--presence-penalty N] [--encoder-repetition-penalty N]
+                 [--no-repeat-ngram-size N] [--repetition-penalty-range N] [--penalty-alpha N] [--guidance-scale N] [--mirostat-mode N] [--mirostat-tau N] [--mirostat-eta N]
+                 [--do-sample | --no-do-sample] [--dynamic-temperature | --no-dynamic-temperature] [--temperature-last | --no-temperature-last] [--sampler-priority N] [--dry-sequence-breakers N]
                  [--enable-thinking | --no-enable-thinking] [--reasoning-effort N] [--chat-template-file CHAT_TEMPLATE_FILE]
 
 Text Generation Web UI
@@ -262,7 +261,7 @@ options:
 
 Basic settings:
   --user-data-dir USER_DATA_DIR                        Path to the user data directory. Default: auto-detected.
-  --multi-user                                         Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.
+  --multi-user                                         Multi-user mode. Chat histories are not saved or automatically loaded. Best suited for small trusted teams.
   --model MODEL                                        Name of the model to load by default.
   --lora LORA [LORA ...]                               The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.
   --model-dir MODEL_DIR                                Path to directory with all the models.
@@ -289,7 +288,7 @@ Model loader:
                                                        LLM.
 
 Context and cache:
-  --ctx-size, --n_ctx, --max_seq_len N                 Context size in tokens. llama.cpp: 0 = auto if gpu-layers is also -1.
+  --ctx-size, --n_ctx, --max_seq_len N                 Context size in tokens. 0 = auto for llama.cpp (requires gpu-layers=-1), 8192 for other loaders.
   --cache-type, --cache_type N                         KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).
 
 Speculative decoding:
@@ -350,11 +349,6 @@ ExLlamaV3:
   --tp-backend TP_BACKEND                              The backend for tensor parallelism. Valid options: native, nccl. Default: native.
   --cfg-cache                                          Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.
 
-RoPE:
-  --alpha_value ALPHA_VALUE                            Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.
-  --rope_freq_base ROPE_FREQ_BASE                      If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).
-  --compress_pos_emb COMPRESS_POS_EMB                  Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale.
-
 Gradio:
   --listen                                             Make the web UI reachable from your local network.
   --listen-port LISTEN_PORT                            The listening port that the server will use.
diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md
index 4d5ae645..744970ac 100644
--- a/docs/04 - Model Tab.md	
+++ b/docs/04 - Model Tab.md	
@@ -41,9 +41,6 @@ Options:
 * **cpu_memory**: Maximum CPU memory in GiB to use for CPU offloading via the accelerate library. Whatever doesn't fit in the GPU or CPU will go to a disk cache if the "disk" checkbox is enabled.
 * **compute_dtype**: Used when "load_in_4bit" is checked. I recommend leaving the default value.
 * **quant_type**: Used when "load_in_4bit" is checked. I recommend leaving the default value.
-* **alpha_value**: Used to extend the context length of a model with a minor loss in quality. I have measured 1.75 to be optimal for 1.5x context, and 2.5 for 2x context. That is, with alpha = 2.5 you can make a model with 4096 context length go to 8192 context length.
-* **rope_freq_base**: Originally another way to write "alpha_value", it ended up becoming a necessary parameter for some models like CodeLlama, which was fine-tuned with this set to 1000000 and hence needs to be loaded with it set to 1000000 as well.
-* **compress_pos_emb**: The first and original context-length extension method, discovered by [kaiokendev](https://kaiokendev.github.io/til). When set to 2, the context length is doubled, 3 and it's tripled, etc. It should only be used for models that have been fine-tuned with this parameter set to different than 1. For models that have not been tuned to have greater context length, alpha_value will lead to a smaller accuracy loss.
 * **attn_implementation**: Choose the attention implementation. Valid options: `sdpa`, `eager`, `flash_attention_2`. The default (`sdpa`) works well in most cases; `flash_attention_2` may be useful for training.
 * **cpu**: Loads the model in CPU mode using Pytorch. The model will be loaded in 32-bit precision, so a lot of RAM will be used. CPU inference with transformers is older than llama.cpp and it works, but it's a lot slower. Note: this parameter has a different interpretation in the llama.cpp loader (see above).
 * **load_in_8bit**: Load the model in 8-bit precision using bitsandbytes. The 8-bit kernel in that library has been optimized for training and not inference, so load_in_8bit is slower than load_in_4bit (but more accurate).
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index fc8e9a19..05c07748 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -403,10 +403,6 @@ class LlamaServer:
         if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
             cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
             cache_type = shared.args.cache_type
-        if shared.args.compress_pos_emb != 1:
-            cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
-        if shared.args.rope_freq_base > 0:
-            cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
         if shared.args.mmproj not in [None, 'None']:
             path = Path(shared.args.mmproj)
             if not path.exists():
diff --git a/modules/loaders.py b/modules/loaders.py
index d2ebdbc3..c90f2ebb 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -15,8 +15,6 @@ loaders_and_params = OrderedDict({
         'tensor_split',
         'extra_flags',
         'streaming_llm',
-        'rope_freq_base',
-        'compress_pos_emb',
         'row_split',
         'no_kv_offload',
         'no_mmap',
@@ -41,8 +39,6 @@ loaders_and_params = OrderedDict({
     'Transformers': [
         'gpu_split',
         'cpu_memory',
-        'alpha_value',
-        'compress_pos_emb',
         'compute_dtype',
         'quant_type',
         'load_in_8bit',
@@ -320,9 +316,6 @@ def list_model_elements():
         'extra_flags',
         'streaming_llm',
         'gpu_split',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
         'compute_dtype',
         'quant_type',
         'load_in_8bit',
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 25a35237..f3c9a986 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -15,9 +15,6 @@ def get_fallback_settings():
     return {
         'bf16': False,
         'ctx_size': 8192,
-        'rope_freq_base': 0,
-        'compress_pos_emb': 1,
-        'alpha_value': 1,
         'truncation_length': shared.settings['truncation_length'],
         'truncation_length_info': shared.settings['truncation_length'],
         'skip_special_tokens': shared.settings['skip_special_tokens'],
@@ -69,12 +66,6 @@ def get_model_metadata(model):
             if k.endswith('.context_length'):
                 model_settings['ctx_size'] = 0
                 model_settings['truncation_length_info'] = metadata[k]
-            elif k.endswith('rope.freq_base'):
-                model_settings['rope_freq_base'] = metadata[k]
-            elif k.endswith('rope.scale_linear'):
-                model_settings['compress_pos_emb'] = metadata[k]
-            elif k.endswith('rope.scaling.factor'):
-                model_settings['compress_pos_emb'] = metadata[k]
             elif k.endswith('.block_count'):
                 model_settings['gpu_layers'] = -1
                 model_settings['max_gpu_layers'] = metadata[k] + 1
@@ -119,15 +110,6 @@ def get_model_metadata(model):
                 model_settings['ctx_size'] = min(value, 8192)
                 break
 
-            if 'rope_theta' in metadata:
-                model_settings['rope_freq_base'] = metadata['rope_theta']
-            elif 'attn_config' in metadata and 'rope_theta' in metadata['attn_config']:
-                model_settings['rope_freq_base'] = metadata['attn_config']['rope_theta']
-
-            if 'rope_scaling' in metadata and isinstance(metadata['rope_scaling'], dict) and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
-                if metadata['rope_scaling']['type'] == 'linear':
-                    model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor']
-
             if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16':
                 model_settings['bf16'] = True
 
@@ -181,10 +163,6 @@ def get_model_metadata(model):
     if 'instruction_template' not in model_settings:
         model_settings['instruction_template'] = 'Alpaca'
 
-    # Ignore rope_freq_base if set to the default value
-    if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000:
-        model_settings.pop('rope_freq_base')
-
     # Apply user settings from user_data/models/config-user.yaml
     settings = shared.user_config
     for pat in settings:
diff --git a/modules/shared.py b/modules/shared.py
index 475d57b7..354f7589 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -139,12 +139,6 @@ group.add_argument('--enable-tp', '--enable_tp', action='store_true', help='Enab
 group.add_argument('--tp-backend', type=str, default='native', help='The backend for tensor parallelism. Valid options: native, nccl. Default: native.')
 group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
 
-# RoPE
-group = parser.add_argument_group('RoPE')
-group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
-group.add_argument('--rope_freq_base', type=int, default=0, help='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).')
-group.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.")
-
 # Gradio
 group = parser.add_argument_group('Gradio')
 group.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
diff --git a/modules/transformers_loader.py b/modules/transformers_loader.py
index b9918764..63758ad7 100644
--- a/modules/transformers_loader.py
+++ b/modules/transformers_loader.py
@@ -136,8 +136,6 @@ def load_model_HF(model_name):
         shared.args.load_in_4bit,
         shared.args.disk,
         shared.args.cpu_memory is not None,
-        shared.args.compress_pos_emb > 1,
-        shared.args.alpha_value > 1,
     ])
 
     # Load the model without any special settings
@@ -200,11 +198,6 @@ def load_model_HF(model_name):
             if shared.args.disk:
                 params['offload_folder'] = str(Path(shared.args.disk_cache_dir))
 
-        if shared.args.compress_pos_emb > 1:
-            params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
-        elif shared.args.alpha_value > 1:
-            params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
-
         logger.info("TRANSFORMERS_PARAMS=")
         pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
         print()
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index b53bc292..08fdc83e 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -100,9 +100,6 @@ def create_ui():
                                 shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
                                 shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
                                 shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
-                                shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
-                                shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
-                                shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
                                 shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
                                 shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')