Commit 1442677

common : refactor cli arg parsing (#7675)
* common : gpt_params_parse do not print usage
* common : rework usage print (wip)
* common : valign
* common : rework print_usage
* infill : remove cfg support
* common : reorder args
* server : deduplicate parameters (ggml-ci)
* common : add missing header (ggml-ci)
* common : remove --random-prompt usages (ggml-ci)
* examples : migrate to gpt_params (ggml-ci)
* batched-bench : migrate to gpt_params
* retrieval : migrate to gpt_params
* common : change defaults for escape and n_ctx
* common : remove chatml and instruct params (ggml-ci)
* common : passkey use gpt_params
1 parent 554c247 commit 1442677
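
For orientation, the per-example pattern this commit migrates the tools to (taken from the batched-bench diff further down and trimmed to a sketch; the `[options]` placeholder and the surrounding `main` body are illustrative, not verbatim) looks roughly like this. It assumes llama.cpp's common library (`common.h` provides `gpt_params`, `gpt_params_parse`, `gpt_params_print_usage`, and pulls in `LOG_TEE`):

```cpp
#include "common.h" // gpt_params, gpt_params_parse, gpt_params_print_usage, LOG_TEE

// each example now carries a small local print_usage() on top of the common one
static void print_usage(int argc, char ** argv, const gpt_params & params) {
    gpt_params_print_usage(argc, argv, params);

    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s -m model.gguf [options]\n", argv[0]); // placeholder usage string
    LOG_TEE("\n");
}

int main(int argc, char ** argv) {
    gpt_params params;

    // gpt_params_parse no longer prints usage on failure; the caller decides what to show
    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv, params);
        return 1;
    }

    // ... example-specific work using the parsed params ...
    return 0;
}
```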

File tree

34 files changed (+900, -1456 lines)

common/common.cpp

Lines changed: 563 additions & 260 deletions
Large diffs are not rendered by default.

common/common.h

Lines changed: 82 additions & 23 deletions
@@ -60,7 +60,7 @@ struct gpt_params {
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 512; // context size
+    int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -99,23 +99,23 @@ struct gpt_params {
     // // sampling parameters
     struct llama_sampling_params sparams;

-    std::string model = ""; // model path
-    std::string model_draft = ""; // draft model for speculative decoding
+    std::string model = ""; // model path
+    std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
-    std::string model_url = ""; // model url to download
-    std::string hf_repo = ""; // HF repo
-    std::string hf_file = ""; // HF file
+    std::string model_url = ""; // model url to download
+    std::string hf_repo = ""; // HF repo
+    std::string hf_file = ""; // HF file
     std::string prompt = "";
-    std::string prompt_file = ""; // store the external prompt file name
-    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::string logdir = ""; // directory in which to save YAML log files
+    std::string prompt_file = ""; // store the external prompt file name
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+    std::string input_prefix = ""; // string to prefix user inputs with
+    std::string input_suffix = ""; // string to suffix user inputs with
+    std::string logdir = ""; // directory in which to save YAML log files
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = ""; // file for saving *all* logits
+    std::string logits_file = ""; // file for saving *all* logits

+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

     // TODO: avoid tuple, use struct
@@ -127,8 +127,8 @@ struct gpt_params {
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector

-    int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                  //    (which is more convenient to use for plotting)
     //
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
@@ -142,30 +142,28 @@ struct gpt_params {

     bool kl_divergence = false; // compute KL divergence

-    bool random_prompt = false; // do not randomize prompt if none provided
+    bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
-    bool interactive = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool special = false; // enable special token output
+    bool interactive = false; // interactive mode
+    bool interactive_first = false; // wait for user input immediately
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
-    bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

     bool embedding = false; // get only sentence embedding
-    bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
+    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
-    bool instruct = false; // instruction mode (used for Alpaca models)
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
+    bool verbose = false;
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool infill = false; // use infill mode
@@ -180,6 +178,47 @@ struct gpt_params {
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
+
+    // server params
+    int32_t port = 8080;
+    int32_t timeout_read = 600;
+    int32_t timeout_write = timeout_read;
+    int32_t n_threads_http = -1;
+
+    std::string hostname = "127.0.0.1";
+    std::string public_path = "";
+    std::string chat_template = "";
+    std::string system_prompt = "";
+
+    std::vector<std::string> api_keys;
+
+    std::string ssl_file_key = "";
+    std::string ssl_file_cert = "";
+
+    bool endpoint_slots = true;
+    bool endpoint_metrics = false;
+
+    bool log_json = false;
+
+    std::string slot_save_path;
+
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
+    // retrieval params
+    std::vector<std::string> context_files; // context files to embed
+
+    int32_t chunk_size = 64; // chunk size for context embedding
+
+    std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos = -1; // position of the passkey in the junk text
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -199,7 +238,20 @@ std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
+
+template<class T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
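
Usage note for the `string_split<T>` template added above: it splits on a delimiter and converts each token with `operator>>`, which is what lets comma-separated CLI lists (e.g. the `-npp 128,256,512` style arguments in the batched-bench README below) be parsed into typed vectors. A minimal standalone sketch (the `main` here is illustrative, not part of the commit):

```cpp
#include <cstdint>
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// copy of the string_split<T> helper added to common.h in this commit
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

int main() {
    // e.g. the kind of list passed via -npp / -ntg / -npl
    const std::vector<int32_t> n_pp = string_split<int32_t>("128,256,512", ',');
    for (const int32_t v : n_pp) {
        printf("%d\n", v); // prints 128, 256, 512 on separate lines
    }
    return 0;
}
```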
@@ -282,6 +334,13 @@ std::string llama_detokenize_bpe(
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

+//
+// Chat template utils
+//
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
 //
 // KV cache utils
 //
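
A hypothetical call site for the new `llama_chat_verify_template` declaration (not part of this diff; the helper and the `params.chat_template` field come from the hunks above, while the surrounding check is only an assumption about how a caller might use it):

```cpp
#include "common.h" // gpt_params, llama_chat_verify_template (assumes the llama.cpp common library)
#include <cstdio>

// reject an unsupported --chat-template value up front instead of failing later
static bool check_chat_template(const gpt_params & params) {
    if (!params.chat_template.empty() && !llama_chat_verify_template(params.chat_template)) {
        fprintf(stderr, "error: unsupported chat template: %s\n", params.chat_template.c_str());
        return false;
    }
    return true;
}
```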

examples/batched-bench/README.md

Lines changed: 4 additions & 4 deletions
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

 ```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]

 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
+./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99

 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps

 # custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
 ```

 ## Sample results

examples/batched-bench/batched-bench.cpp

Lines changed: 20 additions & 72 deletions
@@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
     return ret;
 }

-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
-        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
-        return 1 ;
-    }
-
-    int n_kv_max = 2048;
-    int n_batch = 2048;
-    int n_ubatch = 512;
-    bool flash_attn = false;
-    int is_pp_shared = 0;
-    int n_gpu_layers = 0;
-
-    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
-    std::vector<int> n_tg = { 128, 256, };
-    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
-    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_kv_max = std::atoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        n_batch = std::atoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        n_ubatch = std::atoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        flash_attn = std::atoi(argv[5]);
-    }
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);

-    if (argc >= 7) {
-        is_pp_shared = std::atoi(argv[6]);
-    }
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG_TEE("\n");
+}

-    if (argc >= 8) {
-        n_gpu_layers = std::atoi(argv[7]);
-    }
+int main(int argc, char ** argv) {
+    gpt_params params;

-    if (argc >= 9) {
-        n_pp = parse_list(argv[8]);
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
     }

-    if (argc >= 10) {
-        n_tg = parse_list(argv[9]);
-    }
+    int is_pp_shared = params.is_pp_shared;

-    if (argc >= 11) {
-        n_pl = parse_list(argv[10]);
-    }
+    std::vector<int> n_pp = params.n_pp;
+    std::vector<int> n_tg = params.n_tg;
+    std::vector<int> n_pl = params.n_pl;

     // init LLM

@@ -97,12 +57,7 @@ int main(int argc, char ** argv) {

     // initialize the model

-    llama_model_params model_params = llama_model_default_params();
-
-    const std::vector<float> t_split(llama_max_devices(), 0.0f);
-
-    model_params.n_gpu_layers = n_gpu_layers;
-    model_params.tensor_split = t_split.data();
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);

     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    llama_context_params ctx_params = llama_context_default_params();
-
-    ctx_params.seed = 1234;
-    ctx_params.n_ctx = n_kv_max;
-    ctx_params.n_batch = n_batch;
-    ctx_params.n_ubatch = n_ubatch;
-    ctx_params.flash_attn = flash_attn;
-
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

     // ensure enough sequences are available
     ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
@@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    const int32_t n_kv_max = llama_n_ctx(ctx);
+
     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

     // decode in batches of ctx_params.n_batch tokens
@@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");

     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

examples/batched/README.md

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt

 ```bash
-./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
+./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4

 ...
