diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 27890d8c..47d9d27c 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -326,6 +326,8 @@ class LlamaServer:
             cmd += ["--threads", str(shared.args.threads)]
         if shared.args.threads_batch > 0:
             cmd += ["--threads-batch", str(shared.args.threads_batch)]
+        if shared.args.cpu_moe:
+            cmd.append("--cpu-moe")
         if shared.args.no_mmap:
             cmd.append("--no-mmap")
         if shared.args.mlock:
diff --git a/modules/loaders.py b/modules/loaders.py
index 609a54c6..0f0f6d1e 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -6,6 +6,7 @@ import gradio as gr
 loaders_and_params = OrderedDict({
     'llama.cpp': [
         'gpu_layers',
+        'cpu_moe',
         'threads',
         'threads_batch',
         'batch_size',
diff --git a/modules/shared.py b/modules/shared.py
index e54ba654..1cca1233 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -66,6 +66,7 @@ group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the pr
 # llama.cpp
 group = parser.add_argument_group('llama.cpp')
 group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
+group.add_argument('--cpu-moe', action='store_true', help='Move the experts to the CPU (for MoE models).')
 group.add_argument('--mmproj', type=str, default=None, help='Path to the mmproj file for vision models.')
 group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
 group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
diff --git a/modules/ui.py b/modules/ui.py
index 76533767..d8dcedfb 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -125,6 +125,7 @@ def list_model_elements():
         'loader',
         'cpu_memory',
         'gpu_layers',
+        'cpu_moe',
         'threads',
         'threads_batch',
         'batch_size',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 50ada9f9..31ab929f 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -50,6 +50,7 @@ def create_ui():
 
                     with gr.Column():
                         shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
+                        shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
                         shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                         shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                         shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
@@ -94,7 +95,7 @@ def create_ui():
                         shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
 
                     with gr.Column():
-                        shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
+                        shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
                         shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                         shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
                         shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
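Note (not part of the patch): with this change applied, the option can be enabled either from the CLI (e.g. python server.py --cpu-moe) or via the new "cpu-moe" checkbox in the Model tab, and LlamaServer forwards it as --cpu-moe to the llama-server process it launches. Below is a minimal, self-contained Python sketch of that plumbing; the argparse definitions are copied from the patch, while build_server_cmd and its server_path default are hypothetical names used only for illustration, not code from this repository.

import argparse

# Argument definitions copied from modules/shared.py in the patch.
parser = argparse.ArgumentParser()
group = parser.add_argument_group('llama.cpp')
group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
group.add_argument('--cpu-moe', action='store_true', help='Move the experts to the CPU (for MoE models).')


def build_server_cmd(args, server_path='llama-server'):
    # Hypothetical helper mirroring the pattern used in LlamaServer above:
    # the flag is appended to the llama-server command only when enabled.
    cmd = [server_path, '--gpu-layers', str(args.gpu_layers)]
    if args.cpu_moe:
        cmd.append('--cpu-moe')
    return cmd


if __name__ == '__main__':
    args = parser.parse_args(['--cpu-moe'])
    print(build_server_cmd(args))
    # ['llama-server', '--gpu-layers', '256', '--cpu-moe']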