llama.cpp: Reorganize speculative decoding UI and use recommended ngram-mod defaults

This commit is contained in:
oobabooga 2026-03-04 11:37:59 -08:00
parent eb90daf098
commit cdf0e392e6
4 changed files with 14 additions and 13 deletions

View file

@@ -81,8 +81,8 @@ group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to
group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.')
group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
-group.add_argument('--spec-type', type=str, default='none', choices=['none', 'ngram-cache', 'ngram-simple', 'ngram-map-k', 'ngram-map-k4v', 'ngram-mod'], help='Speculative decoding type for draftless speculation.')
-group.add_argument('--spec-ngram-size-n', type=int, default=12, help='N-gram lookup size for ngram speculative decoding.')
+group.add_argument('--spec-type', type=str, default='none', choices=['none', 'ngram-mod', 'ngram-simple', 'ngram-map-k', 'ngram-map-k4v', 'ngram-cache'], help='Draftless speculative decoding type. Recommended: ngram-mod.')
+group.add_argument('--spec-ngram-size-n', type=int, default=24, help='N-gram lookup size for ngram speculative decoding.')
group.add_argument('--spec-ngram-size-m', type=int, default=48, help='Draft n-gram size for ngram speculative decoding.')
group.add_argument('--spec-ngram-min-hits', type=int, default=1, help='Minimum n-gram hits for ngram-map speculative decoding.')