@@ -60,7 +60,7 @@ struct gpt_params {
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 512; // context size
+    int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -99,23 +99,23 @@ struct gpt_params {
     // // sampling parameters
     struct llama_sampling_params sparams;

-    std::string model = ""; // model path
-    std::string model_draft = ""; // draft model for speculative decoding
+    std::string model = ""; // model path
+    std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
-    std::string model_url = ""; // model url to download
-    std::string hf_repo = ""; // HF repo
-    std::string hf_file = ""; // HF file
+    std::string model_url = ""; // model url to download
+    std::string hf_repo = ""; // HF repo
+    std::string hf_file = ""; // HF file
     std::string prompt = "";
-    std::string prompt_file = ""; // store the external prompt file name
-    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::string logdir = ""; // directory in which to save YAML log files
+    std::string prompt_file = ""; // store the external prompt file name
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+    std::string input_prefix = ""; // string to prefix user inputs with
+    std::string input_suffix = ""; // string to suffix user inputs with
+    std::string logdir = ""; // directory in which to save YAML log files
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = ""; // file for saving *all* logits
+    std::string logits_file = ""; // file for saving *all* logits

+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

     // TODO: avoid tuple, use struct
@@ -127,8 +127,8 @@ struct gpt_params {
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector

-    int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+    int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
     //                             (which is more convenient to use for plotting)
     //
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
@@ -142,30 +142,28 @@ struct gpt_params {

     bool kl_divergence = false; // compute KL divergence

-    bool random_prompt = false; // do not randomize prompt if none provided
+    bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
-    bool interactive = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool special = false; // enable special token output
+    bool interactive = false; // interactive mode
+    bool interactive_first = false; // wait for user input immediately
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
-    bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

     bool embedding = false; // get only sentence embedding
-    bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
+    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
-    bool instruct = false; // instruction mode (used for Alpaca models)
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
+    bool verbose = false;
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool infill = false; // use infill mode
@@ -180,6 +178,47 @@ struct gpt_params {
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
+
+    // server params
+    int32_t port = 8080;
+    int32_t timeout_read = 600;
+    int32_t timeout_write = timeout_read;
+    int32_t n_threads_http = -1;
+
+    std::string hostname = "127.0.0.1";
+    std::string public_path = "";
+    std::string chat_template = "";
+    std::string system_prompt = "";
+
+    std::vector<std::string> api_keys;
+
+    std::string ssl_file_key = "";
+    std::string ssl_file_cert = "";
+
+    bool endpoint_slots = true;
+    bool endpoint_metrics = false;
+
+    bool log_json = false;
+
+    std::string slot_save_path;
+
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
+    // retrieval params
+    std::vector<std::string> context_files; // context files to embed
+
+    int32_t chunk_size = 64; // chunk size for context embedding
+
+    std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos = -1; // position of the passkey in the junk text
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -199,7 +238,20 @@ std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
+
+template<class T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}

 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
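
The templated string_split added above is presumably how comma-separated numeric CLI values (for example the batched-bench n_pp/n_tg/n_pl lists introduced earlier in this diff) get parsed into vectors. Below is a minimal standalone sketch of its behavior; the template body is copied from the diff, while the "128,256,512" input and the main() driver are illustrative only:

#include <cstdint>
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Copied from the diff above so this sketch compiles on its own.
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

int main() {
    // e.g. the value handed to an "-npp"-style flag (illustrative input)
    const std::vector<int32_t> n_pp = string_split<int32_t>("128,256,512", ',');
    for (const int32_t v : n_pp) {
        printf("%d\n", v); // prints 128, 256, 512 on separate lines
    }
    return 0;
}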
@@ -282,6 +334,13 @@ std::string llama_detokenize_bpe(
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

+//
+// Chat template utils
+//
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
 //
 // KV cache utils
 //
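
For context, one plausible way to implement this check is a dry run of llama_chat_apply_template() from llama.h on a dummy message. This is a sketch under the assumption that the function accepts a null model when an explicit template string is given and returns a negative value for templates it does not recognize; the real implementation lives in the accompanying common.cpp change and may differ:

#include <string>
#include "llama.h"

// Sketch: validate a --chat-template value by dry-running it on a dummy message.
bool llama_chat_verify_template(const std::string & tmpl) {
    llama_chat_message chat[] = {{"user", "test"}}; // single throwaway message
    const int res = llama_chat_apply_template(
        nullptr,      // assumption: no model needed when tmpl is supplied explicitly
        tmpl.c_str(), // template string passed via --chat-template
        chat, 1,      // one dummy message
        true,         // add the assistant prefix, as a generation call would
        nullptr, 0);  // no output buffer; only success/failure matters here
    return res >= 0;  // negative return value => template not supported
}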