@@ -609,7 +609,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_draft = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--draft-min"}, "N",
+        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.n_draft_min),
+        [](common_params & params, int value) {
+            params.n_draft_min = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ps", "--p-split"}, "N",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
@@ -1454,7 +1461,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1599,7 +1606,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.model_draft = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
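Below the patch, a minimal C++ sketch of how server-side code might consult the draft-related fields that this change exposes to LLAMA_EXAMPLE_SERVER once common_params_parser_init() has filled them in. The trimmed struct, the field defaults, and the speculation_enabled() helper are illustrative assumptions for this sketch, not part of the patch.

    // Hedged sketch: reading the fields behind --draft, --draft-min and -md/--model-draft.
    // Defaults and helper below are illustrative, not taken from this diff.
    #include <cstdio>
    #include <string>

    struct common_params {             // trimmed to the fields touched by this diff
        int         n_draft     = 16;  // --draft       (illustrative default)
        int         n_draft_min = 5;   // --draft-min   (illustrative default)
        std::string model_draft;       // -md / --model-draft
    };

    // Speculative decoding is only worth enabling when a draft model was given
    // and the requested draft window is at least the configured minimum.
    static bool speculation_enabled(const common_params & params) {
        return !params.model_draft.empty() && params.n_draft >= params.n_draft_min;
    }

    int main() {
        common_params params;               // normally filled by common_params_parser_init()
        params.model_draft = "draft.gguf";  // stand-in for the -md CLI flag
        std::printf("speculative decoding: %s\n", speculation_enabled(params) ? "on" : "off");
        return 0;
    }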