@@ -673,17 +673,8 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
      * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
      * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
      */
-    std::unordered_set<std::string> seen_args;
     auto add_opt = [&](llama_arg arg) {
         if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
-            // make sure there is no argument duplications
-            for (const auto & a : arg.args) {
-                if (seen_args.find(a) == seen_args.end()) {
-                    seen_args.insert(a);
-                } else {
-                    throw std::runtime_error(format("found duplicated argument in source code: %s", a));
-                }
-            }
             options.push_back(std::move(arg));
         }
     };
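
Note: the hunk above removes the runtime duplicate-argument check from add_opt rather than moving it elsewhere. If that safety net is still wanted, an equivalent one-off pass could run once after all options are registered; a minimal sketch, assuming `options` and `llama_arg::args` as used in the surrounding code:

    // Sketch: verify no two registered options share an argument name.
    // Same failure mode as the removed check, but walks the list once.
    static void check_no_duplicate_args(const std::vector<llama_arg> & options) {
        std::unordered_set<std::string> seen;
        for (const auto & opt : options) {
            for (const auto & a : opt.args) {
                if (!seen.insert(a).second) {
                    throw std::runtime_error("found duplicated argument in source code: " + std::string(a));
                }
            }
        }
    }
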
@@ -790,8 +781,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     add_opt(llama_arg(
         {"-C", "--cpu-mask"}, "M",
         "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
-        [](gpt_params & params, const std::string & value) {
-            std::string mask = value;
+        [](gpt_params & params, const std::string & mask) {
             params.cpuparams.mask_valid = true;
             if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
@@ -801,8 +791,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     add_opt(llama_arg(
         {"-Cr", "--cpu-range"}, "lo-hi",
         "range of CPUs for affinity. Complements --cpu-mask",
-        [](gpt_params & params, const std::string & value) {
-            std::string range = value;
+        [](gpt_params & params, const std::string & range) {
             params.cpuparams.mask_valid = true;
             if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
                 throw std::invalid_argument("invalid range");
@@ -816,6 +805,16 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
             params.cpuparams.strict_cpu = std::stoul(value);
         }
     ));
+    add_opt(llama_arg(
+        {"--prio"}, "N",
+        format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ));
     add_opt(llama_arg(
         {"--poll"}, "<0...100>",
         format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
@@ -826,8 +825,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     add_opt(llama_arg(
         {"-Cb", "--cpu-mask-batch"}, "M",
         "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
-        [](gpt_params & params, const std::string & value) {
-            std::string mask = value;
+        [](gpt_params & params, const std::string & mask) {
             params.cpuparams_batch.mask_valid = true;
             if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
@@ -837,8 +835,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     add_opt(llama_arg(
         {"-Crb", "--cpu-range-batch"}, "lo-hi",
         "ranges of CPUs for affinity. Complements --cpu-mask-batch",
-        [](gpt_params & params, const std::string & value) {
-            std::string range = value;
+        [](gpt_params & params, const std::string & range) {
             params.cpuparams_batch.mask_valid = true;
             if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
                 throw std::invalid_argument("invalid range");
@@ -852,6 +849,16 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
             params.cpuparams_batch.strict_cpu = value;
         }
     ));
+    add_opt(llama_arg(
+        {"--prio-batch"}, "N",
+        format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ));
     add_opt(llama_arg(
         {"--poll-batch"}, "<0|1>",
         "use polling to wait for work (default: same as --poll)",
@@ -862,8 +869,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     add_opt(llama_arg(
         {"-Cd", "--cpu-mask-draft"}, "M",
         "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](gpt_params & params, const std::string & value) {
-            std::string mask = value;
+        [](gpt_params & params, const std::string & mask) {
             params.draft_cpuparams.mask_valid = true;
             if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
@@ -873,8 +879,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     add_opt(llama_arg(
         {"-Crd", "--cpu-range-draft"}, "lo-hi",
         "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
-        [](gpt_params & params, const std::string & value) {
-            std::string range = value;
+        [](gpt_params & params, const std::string & range) {
             params.draft_cpuparams.mask_valid = true;
             if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
                 throw std::invalid_argument("invalid range");
@@ -888,18 +893,37 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
             params.draft_cpuparams.strict_cpu = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--prio-draft"}, "N",
+        format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"--poll-draft"}, "<0|1>",
         "Use polling to wait for draft model work (default: same as --poll])",
         [](gpt_params & params, int value) {
             params.draft_cpuparams.poll = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](gpt_params & params, const std::string & mask) {
+            params.draft_cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
         "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
-        [](gpt_params & params, const std::string & value) {
-            std::string range = value;
+        [](gpt_params & params, const std::string & range) {
             params.draft_cpuparams_batch.mask_valid = true;
             if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
                 throw std::invalid_argument("invalid cpumask");
@@ -913,6 +937,16 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
             params.draft_cpuparams_batch.strict_cpu = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--prio-batch-draft"}, "N",
+        format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"--poll-batch-draft"}, "<0|1>",
         "Use polling to wait for draft model work (default: --poll-draft)",
@@ -1124,45 +1158,45 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [](gpt_params & params) {
             params.interactive = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-if", "--interactive-first"},
         format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
         [](gpt_params & params) {
             params.interactive_first = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-mli", "--multiline-input"},
         "allows you to write or paste multiple lines without ending each in '\\'",
         [](gpt_params & params) {
             params.multiline_input = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--in-prefix-bos"},
         "prefix BOS to user inputs, preceding the `--in-prefix` string",
         [](gpt_params & params) {
             params.input_prefix_bos = true;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--in-prefix"}, "STRING",
         "string to prefix user inputs with (default: empty)",
         [](gpt_params & params, const std::string & value) {
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
         [](gpt_params & params, const std::string & value) {
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",
@@ -1499,7 +1533,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         }
     ));
     add_opt(llama_arg(
-        {"--all-logits"},
+        {"--perplexity", "--all-logits"},
         format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
         [](gpt_params & params) {
             params.logits_all = true;
@@ -1554,6 +1588,13 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
             params.kl_divergence = true;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
+        "set logits file",
+        [](gpt_params & params, const std::string & value) {
+            params.logits_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(llama_arg(
         {"--ppl-stride"}, "N",
         format("stride for perplexity calculation (default: %d)", params.ppl_stride),
@@ -1802,7 +1843,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [](gpt_params & params, const std::string & value) {
             params.model_alias = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -1890,7 +1931,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         }
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(llama_arg(
-        {"-o", "--output"}, "FNAME",
+        {"-o", "--output", "--output-file"}, "FNAME",
         format("output file (default: '%s')",
             ex == LLAMA_EXAMPLE_EXPORT_LORA
                 ? params.lora_outfile.c_str()
@@ -1932,7 +1973,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(llama_arg(
-        {"--chunk"}, "N",
+        {"--chunk", "--from-chunk"}, "N",
         format("start processing the input from chunk N (default: %d)", params.i_chunk),
         [](gpt_params & params, int value) {
             params.i_chunk = value;
@@ -2057,7 +2098,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
-        {"--timeout"}, "N",
+        {"-to", "--timeout"}, "N",
         format("server read/write timeout in seconds (default: %d)", params.timeout_read),
         [](gpt_params & params, int value) {
             params.timeout_read = value;
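
A closing note on the handler lambdas seen throughout: options take either a string handler (const std::string &), an int handler, or a no-value handler for plain flags, and llama_arg presumably dispatches via overloaded constructors. The signatures below are assumptions sketched for illustration, not quoted from the commit:

    // Assumed constructor overloads: one per supported handler shape.
    llama_arg(const std::initializer_list<const char *> & args, const char * value_hint,
              const std::string & help, void (*handler)(gpt_params &, const std::string &));
    llama_arg(const std::initializer_list<const char *> & args, const char * value_hint,
              const std::string & help, void (*handler)(gpt_params &, int));
    llama_arg(const std::initializer_list<const char *> & args,
              const std::string & help, void (*handler)(gpt_params &));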