mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-06 05:33:50 +01:00
llama.cpp: Reorganize speculative decoding UI and use recommended ngram-mod defaults
This commit is contained in:
parent
eb90daf098
commit
cdf0e392e6
|
|
@@ -401,14 +401,10 @@ class LlamaServer:
|
|||
cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
|
||||
if shared.args.spec_type != 'none':
|
||||
cmd += ["--spec-type", shared.args.spec_type]
|
||||
if shared.args.draft_max > 0:
|
||||
cmd += ["--draft-max", str(shared.args.draft_max)]
|
||||
if shared.args.spec_ngram_size_n != 12:
|
||||
cmd += ["--spec-ngram-size-n", str(shared.args.spec_ngram_size_n)]
|
||||
if shared.args.spec_ngram_size_m != 48:
|
||||
cmd += ["--spec-ngram-size-m", str(shared.args.spec_ngram_size_m)]
|
||||
if shared.args.spec_ngram_min_hits != 1:
|
||||
cmd += ["--spec-ngram-min-hits", str(shared.args.spec_ngram_min_hits)]
|
||||
cmd += ["--draft-max", str(shared.args.draft_max)]
|
||||
cmd += ["--spec-ngram-size-n", str(shared.args.spec_ngram_size_n)]
|
||||
cmd += ["--spec-ngram-size-m", str(shared.args.spec_ngram_size_m)]
|
||||
cmd += ["--spec-ngram-min-hits", str(shared.args.spec_ngram_min_hits)]
|
||||
if shared.args.streaming_llm:
|
||||
cmd += ["--cache-reuse", "1"]
|
||||
cmd += ["--swa-full"]
|
||||
|
|
|
|||
|
|
@@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({
|
|||
'gpu_layers_draft',
|
||||
'device_draft',
|
||||
'ctx_size_draft',
|
||||
'ngram_header',
|
||||
'spec_type',
|
||||
'spec_ngram_size_n',
|
||||
'spec_ngram_size_m',
|
||||
|
|
|
|||
|
|
@@ -81,8 +81,8 @@ group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to
|
|||
group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.')
|
||||
group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
|
||||
group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
|
||||
group.add_argument('--spec-type', type=str, default='none', choices=['none', 'ngram-cache', 'ngram-simple', 'ngram-map-k', 'ngram-map-k4v', 'ngram-mod'], help='Speculative decoding type for draftless speculation.')
|
||||
group.add_argument('--spec-ngram-size-n', type=int, default=12, help='N-gram lookup size for ngram speculative decoding.')
|
||||
group.add_argument('--spec-type', type=str, default='none', choices=['none', 'ngram-mod', 'ngram-simple', 'ngram-map-k', 'ngram-map-k4v', 'ngram-cache'], help='Draftless speculative decoding type. Recommended: ngram-mod.')
|
||||
group.add_argument('--spec-ngram-size-n', type=int, default=24, help='N-gram lookup size for ngram speculative decoding.')
|
||||
group.add_argument('--spec-ngram-size-m', type=int, default=48, help='Draft n-gram size for ngram speculative decoding.')
|
||||
group.add_argument('--spec-ngram-min-hits', type=int, default=1, help='Minimum n-gram hits for ngram-map speculative decoding.')
|
||||
|
||||
|
|
|
|||
|
|
@@ -68,15 +68,19 @@ def create_ui():
|
|||
|
||||
# Speculative decoding
|
||||
with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
|
||||
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Maximum number of tokens to draft for speculative decoding. Recommended: 4 for draft model, 64 for n-gram.')
|
||||
|
||||
gr.Markdown('#### Draft model')
|
||||
with gr.Row():
|
||||
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=['None'] + utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
|
||||
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=['None'] + utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Must share the same vocabulary as the main model.', interactive=not mu)
|
||||
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': ['None'] + utils.get_available_models()}, 'refresh-button', interactive=not mu)
|
||||
|
||||
shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
|
||||
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.')
|
||||
shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
|
||||
shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
|
||||
shared.gradio['spec_type'] = gr.Dropdown(label="spec-type", choices=['none', 'ngram-cache', 'ngram-simple', 'ngram-map-k', 'ngram-map-k4v', 'ngram-mod'], value=shared.args.spec_type, info='Draftless speculative decoding type. Uses n-gram matching from context.')
|
||||
|
||||
shared.gradio['ngram_header'] = gr.Markdown('#### N-gram (draftless)')
|
||||
shared.gradio['spec_type'] = gr.Dropdown(label="spec-type", choices=['none', 'ngram-mod', 'ngram-simple', 'ngram-map-k', 'ngram-map-k4v', 'ngram-cache'], value=shared.args.spec_type, info='Draftless speculative decoding type. Recommended: ngram-mod.')
|
||||
shared.gradio['spec_ngram_size_n'] = gr.Number(label="spec-ngram-size-n", precision=0, step=1, value=shared.args.spec_ngram_size_n, info='N-gram lookup size for speculative decoding.', visible=shared.args.spec_type != 'none')
|
||||
shared.gradio['spec_ngram_size_m'] = gr.Number(label="spec-ngram-size-m", precision=0, step=1, value=shared.args.spec_ngram_size_m, info='Draft n-gram size for speculative decoding.', visible=shared.args.spec_type != 'none')
|
||||
shared.gradio['spec_ngram_min_hits'] = gr.Number(label="spec-ngram-min-hits", precision=0, step=1, value=shared.args.spec_ngram_min_hits, info='Minimum n-gram hits for ngram-map speculative decoding.', visible=shared.args.spec_type != 'none')
|
||||
|
|
|
|||
Loading…
Reference in a new issue